diff mbox

[EDACv16,1/2] edac: Change internal representation to work with layers

Message ID 4F9ABCEC.9090807@redhat.com (mailing list archive)
State Not Applicable
Headers show

Commit Message

Mauro Carvalho Chehab April 27, 2012, 3:36 p.m. UTC
Em 27-04-2012 10:33, Borislav Petkov escreveu:
> Btw,
> 
> this patch gives
> 
> [    8.278399] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 0: dimm0 (0:0:0): row 0, chan 0
> [    8.287594] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 1: dimm1 (0:1:0): row 0, chan 1
> [    8.296784] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 2: dimm2 (1:0:0): row 1, chan 0
> [    8.305968] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 3: dimm3 (1:1:0): row 1, chan 1
> [    8.315144] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 4: dimm4 (2:0:0): row 2, chan 0
> [    8.324326] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 5: dimm5 (2:1:0): row 2, chan 1
> [    8.333502] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 6: dimm6 (3:0:0): row 3, chan 0
> [    8.342684] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 7: dimm7 (3:1:0): row 3, chan 1
> [    8.351860] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 8: dimm8 (4:0:0): row 4, chan 0
> [    8.361049] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 9: dimm9 (4:1:0): row 4, chan 1
> [    8.370227] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 10: dimm10 (5:0:0): row 5, chan 0
> [    8.379582] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 11: dimm11 (5:1:0): row 5, chan 1
> [    8.388941] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 12: dimm12 (6:0:0): row 6, chan 0
> [    8.398315] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 13: dimm13 (6:1:0): row 6, chan 1
> [    8.407680] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 14: dimm14 (7:0:0): row 7, chan 0
> [    8.417047] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 15: dimm15 (7:1:0): row 7, chan 1
> 
> and the memory controller has the following chip selects
> 
> [    8.137662] EDAC MC: DCT0 chip selects:
> [    8.150291] EDAC amd64: MC: 0:  2048MB 1:  2048MB
> [    8.155349] EDAC amd64: MC: 2:  2048MB 3:  2048MB
> [    8.160408] EDAC amd64: MC: 4:     0MB 5:     0MB
> [    8.165475] EDAC amd64: MC: 6:     0MB 7:     0MB
> [    8.180499] EDAC MC: DCT1 chip selects:
> [    8.184693] EDAC amd64: MC: 0:  2048MB 1:  2048MB
> [    8.189753] EDAC amd64: MC: 2:  2048MB 3:  2048MB
> [    8.194812] EDAC amd64: MC: 4:     0MB 5:     0MB
> [    8.199875] EDAC amd64: MC: 6:     0MB 7:     0MB
> 
> Those are 4 dual-ranked DIMMs on this node, DCT0 is one channel and DCT1
> is another and I have 4 ranks per channel. Having dimm0-dimm15 is very
> misleading and has nothing to do with the reality. So, if this is to use
> your nomenclature with layers, I'll have dimm0-dimm7 where each dimm is
> a rank.
> 
> Or, the most correct thing to do would be to have dimm0-dimm3, each
> dual-ranked.
> 
> So either tot_dimms is computed wrongly or there's a more serious error
> somewhere.
> 
> I've reviewed almost the half patch, will review the rest when/if we
> sort out the above issue first.
> 
> Thanks.


The fix for it were in another patch[1], as calling them as "rank" is needed
also at the sysfs API.

[1] http://lists-archives.com/linux-kernel/27623222-edac-add-a-new-per-dimm-api-and-make-the-old-per-virtual-rank-api-obsolete.html

I can just merge the fix on this patch, with the enclosed diff.

Regards,
Mauro

Comments

Borislav Petkov April 28, 2012, 9:05 a.m. UTC | #1
On Fri, Apr 27, 2012 at 12:36:12PM -0300, Mauro Carvalho Chehab wrote:
> The fix for it were in another patch[1], as calling them as "rank" is
> needed also at the sysfs API.

No, this doesn't fix it either:

[   10.486440] EDAC MC: DCT0 chip selects:
[   10.486443] EDAC amd64: MC: 0:  2048MB 1:  2048MB
[   10.486445] EDAC amd64: MC: 2:  2048MB 3:  2048MB
[   10.486448] EDAC amd64: MC: 4:     0MB 5:     0MB
[   10.486450] EDAC amd64: MC: 6:     0MB 7:     0MB
[   10.486453] EDAC DEBUG: amd64_debug_display_dimm_sizes: F2x180 (DRAM Bank Address Mapping): 0x00000088
[   10.486455] EDAC MC: DCT1 chip selects:
[   10.486458] EDAC amd64: MC: 0:  2048MB 1:  2048MB
[   10.486460] EDAC amd64: MC: 2:  2048MB 3:  2048MB
[   10.486463] EDAC amd64: MC: 4:     0MB 5:     0MB
[   10.486465] EDAC amd64: MC: 6:     0MB 7:     0MB
[   10.486467] EDAC amd64: using x8 syndromes.
[   10.486469] EDAC DEBUG: amd64_dump_dramcfg_low: F2x190 (DRAM Cfg Low): 0x00083100
[   10.486472] EDAC DEBUG: amd64_dump_dramcfg_low:   DIMM type: buffered; all DIMMs support ECC: yes
[   10.486475] EDAC DEBUG: amd64_dump_dramcfg_low:   PAR/ERR parity: enabled
[   10.486478] EDAC DEBUG: amd64_dump_dramcfg_low:   DCT 128bit mode width: 64b
[   10.486481] EDAC DEBUG: amd64_dump_dramcfg_low:   x4 logical DIMMs present: L0: yes L1: yes L2: no L3: no
[   10.486485] EDAC DEBUG: f1x_early_channel_count: Data width is not 128 bits - need more decoding
[   10.486488] EDAC amd64: MCT channel count: 2
[   10.486493] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc(): allocating 3692 bytes for mci data (16 ranks, 16 csrows/channels)
[   10.486501] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 0: rank0 (0:0:0): row 0, chan 0
[   10.486506] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 1: rank1 (0:1:0): row 0, chan 1
[   10.486510] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 2: rank2 (1:0:0): row 1, chan 0
[   10.486514] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 3: rank3 (1:1:0): row 1, chan 1
[   10.486518] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 4: rank4 (2:0:0): row 2, chan 0
[   10.486522] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 5: rank5 (2:1:0): row 2, chan 1
[   10.486526] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 6: rank6 (3:0:0): row 3, chan 0
[   10.486530] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 7: rank7 (3:1:0): row 3, chan 1
[   10.486534] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 8: rank8 (4:0:0): row 4, chan 0
[   10.486538] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 9: rank9 (4:1:0): row 4, chan 1
[   10.486542] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 10: rank10 (5:0:0): row 5, chan 0
[   10.486546] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 11: rank11 (5:1:0): row 5, chan 1
[   10.486550] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 12: rank12 (6:0:0): row 6, chan 0
[   10.486554] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 13: rank13 (6:1:0): row 6, chan 1
[   10.486558] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 14: rank14 (7:0:0): row 7, chan 0
[   10.486562] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 15: rank15 (7:1:0): row 7, chan 1

DCT0 has 4 ranks + DCT1 also 4 ranks = 8 ranks total.

Now your change is showing 16 ranks. Still b0rked.
Mauro Carvalho Chehab April 29, 2012, 1:49 p.m. UTC | #2
Em 28-04-2012 06:05, Borislav Petkov escreveu:
> On Fri, Apr 27, 2012 at 12:36:12PM -0300, Mauro Carvalho Chehab wrote:
>> The fix for it were in another patch[1], as calling them as "rank" is
>> needed also at the sysfs API.
> 
> No, this doesn't fix it either:
> 
> [   10.486440] EDAC MC: DCT0 chip selects:
> [   10.486443] EDAC amd64: MC: 0:  2048MB 1:  2048MB
> [   10.486445] EDAC amd64: MC: 2:  2048MB 3:  2048MB
> [   10.486448] EDAC amd64: MC: 4:     0MB 5:     0MB
> [   10.486450] EDAC amd64: MC: 6:     0MB 7:     0MB
> [   10.486453] EDAC DEBUG: amd64_debug_display_dimm_sizes: F2x180 (DRAM Bank Address Mapping): 0x00000088
> [   10.486455] EDAC MC: DCT1 chip selects:
> [   10.486458] EDAC amd64: MC: 0:  2048MB 1:  2048MB
> [   10.486460] EDAC amd64: MC: 2:  2048MB 3:  2048MB
> [   10.486463] EDAC amd64: MC: 4:     0MB 5:     0MB
> [   10.486465] EDAC amd64: MC: 6:     0MB 7:     0MB
> [   10.486467] EDAC amd64: using x8 syndromes.
> [   10.486469] EDAC DEBUG: amd64_dump_dramcfg_low: F2x190 (DRAM Cfg Low): 0x00083100
> [   10.486472] EDAC DEBUG: amd64_dump_dramcfg_low:   DIMM type: buffered; all DIMMs support ECC: yes
> [   10.486475] EDAC DEBUG: amd64_dump_dramcfg_low:   PAR/ERR parity: enabled
> [   10.486478] EDAC DEBUG: amd64_dump_dramcfg_low:   DCT 128bit mode width: 64b
> [   10.486481] EDAC DEBUG: amd64_dump_dramcfg_low:   x4 logical DIMMs present: L0: yes L1: yes L2: no L3: no
> [   10.486485] EDAC DEBUG: f1x_early_channel_count: Data width is not 128 bits - need more decoding
> [   10.486488] EDAC amd64: MCT channel count: 2
> [   10.486493] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc(): allocating 3692 bytes for mci data (16 ranks, 16 csrows/channels)
> [   10.486501] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 0: rank0 (0:0:0): row 0, chan 0
> [   10.486506] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 1: rank1 (0:1:0): row 0, chan 1
> [   10.486510] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 2: rank2 (1:0:0): row 1, chan 0
> [   10.486514] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 3: rank3 (1:1:0): row 1, chan 1
> [   10.486518] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 4: rank4 (2:0:0): row 2, chan 0
> [   10.486522] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 5: rank5 (2:1:0): row 2, chan 1
> [   10.486526] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 6: rank6 (3:0:0): row 3, chan 0
> [   10.486530] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 7: rank7 (3:1:0): row 3, chan 1
> [   10.486534] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 8: rank8 (4:0:0): row 4, chan 0
> [   10.486538] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 9: rank9 (4:1:0): row 4, chan 1
> [   10.486542] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 10: rank10 (5:0:0): row 5, chan 0
> [   10.486546] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 11: rank11 (5:1:0): row 5, chan 1
> [   10.486550] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 12: rank12 (6:0:0): row 6, chan 0
> [   10.486554] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 13: rank13 (6:1:0): row 6, chan 1
> [   10.486558] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 14: rank14 (7:0:0): row 7, chan 0
> [   10.486562] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 15: rank15 (7:1:0): row 7, chan 1
> 
> DCT0 has 4 ranks + DCT1 also 4 ranks = 8 ranks total.
> 
> Now your change is showing 16 ranks. Still b0rked.
> 
No, DCT0+DCT1 have 16 ranks, 8 filled and 8 empty. So, it is OK.

As I said before when you've pointed this bug (likel at v3 review), edac_mc_alloc
doesn't know how many ranks are filled, as the driver logic first calls it to 
allocate for the max amount of ranks, and then fills the rank with their info 
(or let them untouched with 0 pages, if they're empty).

Regards,
Mauro
Borislav Petkov April 30, 2012, 8:15 a.m. UTC | #3
On Sun, Apr 29, 2012 at 10:49:44AM -0300, Mauro Carvalho Chehab wrote:
> > [   10.486440] EDAC MC: DCT0 chip selects:
> > [   10.486443] EDAC amd64: MC: 0:  2048MB 1:  2048MB
> > [   10.486445] EDAC amd64: MC: 2:  2048MB 3:  2048MB
> > [   10.486448] EDAC amd64: MC: 4:     0MB 5:     0MB
> > [   10.486450] EDAC amd64: MC: 6:     0MB 7:     0MB
> > [   10.486453] EDAC DEBUG: amd64_debug_display_dimm_sizes: F2x180 (DRAM Bank Address Mapping): 0x00000088
> > [   10.486455] EDAC MC: DCT1 chip selects:
> > [   10.486458] EDAC amd64: MC: 0:  2048MB 1:  2048MB
> > [   10.486460] EDAC amd64: MC: 2:  2048MB 3:  2048MB
> > [   10.486463] EDAC amd64: MC: 4:     0MB 5:     0MB
> > [   10.486465] EDAC amd64: MC: 6:     0MB 7:     0MB
> > [   10.486467] EDAC amd64: using x8 syndromes.
> > [   10.486469] EDAC DEBUG: amd64_dump_dramcfg_low: F2x190 (DRAM Cfg Low): 0x00083100
> > [   10.486472] EDAC DEBUG: amd64_dump_dramcfg_low:   DIMM type: buffered; all DIMMs support ECC: yes
> > [   10.486475] EDAC DEBUG: amd64_dump_dramcfg_low:   PAR/ERR parity: enabled
> > [   10.486478] EDAC DEBUG: amd64_dump_dramcfg_low:   DCT 128bit mode width: 64b
> > [   10.486481] EDAC DEBUG: amd64_dump_dramcfg_low:   x4 logical DIMMs present: L0: yes L1: yes L2: no L3: no
> > [   10.486485] EDAC DEBUG: f1x_early_channel_count: Data width is not 128 bits - need more decoding
> > [   10.486488] EDAC amd64: MCT channel count: 2
> > [   10.486493] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc(): allocating 3692 bytes for mci data (16 ranks, 16 csrows/channels)
> > [   10.486501] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 0: rank0 (0:0:0): row 0, chan 0
> > [   10.486506] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 1: rank1 (0:1:0): row 0, chan 1
> > [   10.486510] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 2: rank2 (1:0:0): row 1, chan 0
> > [   10.486514] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 3: rank3 (1:1:0): row 1, chan 1
> > [   10.486518] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 4: rank4 (2:0:0): row 2, chan 0
> > [   10.486522] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 5: rank5 (2:1:0): row 2, chan 1
> > [   10.486526] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 6: rank6 (3:0:0): row 3, chan 0
> > [   10.486530] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 7: rank7 (3:1:0): row 3, chan 1
> > [   10.486534] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 8: rank8 (4:0:0): row 4, chan 0
> > [   10.486538] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 9: rank9 (4:1:0): row 4, chan 1
> > [   10.486542] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 10: rank10 (5:0:0): row 5, chan 0
> > [   10.486546] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 11: rank11 (5:1:0): row 5, chan 1
> > [   10.486550] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 12: rank12 (6:0:0): row 6, chan 0
> > [   10.486554] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 13: rank13 (6:1:0): row 6, chan 1
> > [   10.486558] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 14: rank14 (7:0:0): row 7, chan 0
> > [   10.486562] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 15: rank15 (7:1:0): row 7, chan 1
> > 
> > DCT0 has 4 ranks + DCT1 also 4 ranks = 8 ranks total.
> > 
> > Now your change is showing 16 ranks. Still b0rked.
> > 
> No, DCT0+DCT1 have 16 ranks, 8 filled and 8 empty. So, it is OK.
> 
> As I said before when you've pointed this bug (likel at v3 review), edac_mc_alloc
> doesn't know how many ranks are filled, as the driver logic first calls it to 
> allocate for the max amount of ranks, and then fills the rank with their info 
> (or let them untouched with 0 pages, if they're empty).

Basically you're saying you're generating dimm_info structs for all
_possible_ dimms and the loop where this debug message comes from goes
and marrily initializes them all although some of them are empty:

+       for (i = 0; i < tot_dimms; i++) {
+               chan = &csi[row].channels[chn];
+               dimm = EDAC_DIMM_PTR(lay, mci->dimms, n_layers,
+                              pos[0], pos[1], pos[2]);
+               dimm->mci = mci;
+
+               debugf2("%s: %d: dimm%zd (%d:%d:%d): row %d, chan %d\n", __func__,
+                       i, (dimm - mci->dimms),
+                       pos[0], pos[1], pos[2], row, chn);
+
+               /* Copy DIMM location */
+               for (j = 0; j < n_layers; j++)
+                       dimm->location[j] = pos[j];
...

definitely superfluous.

Oh well, looking at edac_mc_alloc, it used to allocate structs for all
csrows on the controller even though some of them were empty...

Ok, then please remove this debug call because it is misleading. Having

[   10.486493] EDAC DEBUG: new_edac_mc_alloc: allocating 3692 bytes for mci data (16 ranks, 16 csrows/channels)

is enough.

You probably want to say how many channels/csrows there are, though:

[   10.486493] EDAC DEBUG: new_edac_mc_alloc: allocating 3692 bytes for mci data (16 ranks, 8 csrows, 2 channels)

or something similar. Simply dump tot_dimms, tot_channels and tot_csrows
and that's it.
Mauro Carvalho Chehab April 30, 2012, 10:58 a.m. UTC | #4
Em 30-04-2012 05:15, Borislav Petkov escreveu:
> On Sun, Apr 29, 2012 at 10:49:44AM -0300, Mauro Carvalho Chehab wrote:
>>> [   10.486440] EDAC MC: DCT0 chip selects:
>>> [   10.486443] EDAC amd64: MC: 0:  2048MB 1:  2048MB
>>> [   10.486445] EDAC amd64: MC: 2:  2048MB 3:  2048MB
>>> [   10.486448] EDAC amd64: MC: 4:     0MB 5:     0MB
>>> [   10.486450] EDAC amd64: MC: 6:     0MB 7:     0MB
>>> [   10.486453] EDAC DEBUG: amd64_debug_display_dimm_sizes: F2x180 (DRAM Bank Address Mapping): 0x00000088
>>> [   10.486455] EDAC MC: DCT1 chip selects:
>>> [   10.486458] EDAC amd64: MC: 0:  2048MB 1:  2048MB
>>> [   10.486460] EDAC amd64: MC: 2:  2048MB 3:  2048MB
>>> [   10.486463] EDAC amd64: MC: 4:     0MB 5:     0MB
>>> [   10.486465] EDAC amd64: MC: 6:     0MB 7:     0MB
>>> [   10.486467] EDAC amd64: using x8 syndromes.
>>> [   10.486469] EDAC DEBUG: amd64_dump_dramcfg_low: F2x190 (DRAM Cfg Low): 0x00083100
>>> [   10.486472] EDAC DEBUG: amd64_dump_dramcfg_low:   DIMM type: buffered; all DIMMs support ECC: yes
>>> [   10.486475] EDAC DEBUG: amd64_dump_dramcfg_low:   PAR/ERR parity: enabled
>>> [   10.486478] EDAC DEBUG: amd64_dump_dramcfg_low:   DCT 128bit mode width: 64b
>>> [   10.486481] EDAC DEBUG: amd64_dump_dramcfg_low:   x4 logical DIMMs present: L0: yes L1: yes L2: no L3: no
>>> [   10.486485] EDAC DEBUG: f1x_early_channel_count: Data width is not 128 bits - need more decoding
>>> [   10.486488] EDAC amd64: MCT channel count: 2
>>> [   10.486493] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc(): allocating 3692 bytes for mci data (16 ranks, 16 csrows/channels)
>>> [   10.486501] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 0: rank0 (0:0:0): row 0, chan 0
>>> [   10.486506] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 1: rank1 (0:1:0): row 0, chan 1
>>> [   10.486510] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 2: rank2 (1:0:0): row 1, chan 0
>>> [   10.486514] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 3: rank3 (1:1:0): row 1, chan 1
>>> [   10.486518] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 4: rank4 (2:0:0): row 2, chan 0
>>> [   10.486522] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 5: rank5 (2:1:0): row 2, chan 1
>>> [   10.486526] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 6: rank6 (3:0:0): row 3, chan 0
>>> [   10.486530] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 7: rank7 (3:1:0): row 3, chan 1
>>> [   10.486534] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 8: rank8 (4:0:0): row 4, chan 0
>>> [   10.486538] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 9: rank9 (4:1:0): row 4, chan 1
>>> [   10.486542] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 10: rank10 (5:0:0): row 5, chan 0
>>> [   10.486546] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 11: rank11 (5:1:0): row 5, chan 1
>>> [   10.486550] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 12: rank12 (6:0:0): row 6, chan 0
>>> [   10.486554] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 13: rank13 (6:1:0): row 6, chan 1
>>> [   10.486558] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 14: rank14 (7:0:0): row 7, chan 0
>>> [   10.486562] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 15: rank15 (7:1:0): row 7, chan 1
>>>
>>> DCT0 has 4 ranks + DCT1 also 4 ranks = 8 ranks total.
>>>
>>> Now your change is showing 16 ranks. Still b0rked.
>>>
>> No, DCT0+DCT1 have 16 ranks, 8 filled and 8 empty. So, it is OK.
>>
>> As I said before when you've pointed this bug (likel at v3 review), edac_mc_alloc
>> doesn't know how many ranks are filled, as the driver logic first calls it to 
>> allocate for the max amount of ranks, and then fills the rank with their info 
>> (or let them untouched with 0 pages, if they're empty).
> 
> Basically you're saying you're generating dimm_info structs for all
> _possible_ dimms and the loop where this debug message comes from goes
> and marrily initializes them all although some of them are empty:
> 
> +       for (i = 0; i < tot_dimms; i++) {
> +               chan = &csi[row].channels[chn];
> +               dimm = EDAC_DIMM_PTR(lay, mci->dimms, n_layers,
> +                              pos[0], pos[1], pos[2]);
> +               dimm->mci = mci;
> +
> +               debugf2("%s: %d: dimm%zd (%d:%d:%d): row %d, chan %d\n", __func__,
> +                       i, (dimm - mci->dimms),
> +                       pos[0], pos[1], pos[2], row, chn);
> +
> +               /* Copy DIMM location */
> +               for (j = 0; j < n_layers; j++)
> +                       dimm->location[j] = pos[j];
> ...
> 
> definitely superfluous.
> 
> Oh well, looking at edac_mc_alloc, it used to allocate structs for all
> csrows on the controller even though some of them were empty...
> 
> Ok, then please remove this debug call because it is misleading. Having
> 
> [   10.486493] EDAC DEBUG: new_edac_mc_alloc: allocating 3692 bytes for mci data (16 ranks, 16 csrows/channels)
> 
> is enough.
> 
> You probably want to say how many channels/csrows there are, though:
> 
> [   10.486493] EDAC DEBUG: new_edac_mc_alloc: allocating 3692 bytes for mci data (16 ranks, 8 csrows, 2 channels)
> 
> or something similar. Simply dump tot_dimms, tot_channels and tot_csrows
> and that's it.
> 

It seems you have a very short memory. We had a similar discussion about that a while ago:
	https://lkml.org/lkml/2012/3/8/440

See my comments at:
	https://lkml.org/lkml/2012/3/9/101
	https://lkml.org/lkml/2012/3/9/267

As it was explained there, those debug messages provide a map between the legacy per-csrow
data, used by the old API and the dimm_info representation. For a per-csrow memory controller,
the map is trivial, as the memory location will match the csrow/channel information, but
for modern memory controllers, the map info is not trivial and it helps to check what it is
expected to be found when retrieving information via the legacy EDAC API.

For example, this is the mapping used by the second memory controller of the SB machine
I'm using on my tests:

[52803.640043] EDAC DEBUG: sbridge_probe: Registering MC#1 (2 of 2)
...
[52803.640062] EDAC DEBUG: edac_mc_alloc: edac_mc_alloc(): allocating 7196 bytes for mci data (12 dimms, 12 csrows/channels)
[52803.640070] EDAC DEBUG: edac_mc_alloc: edac_mc_alloc: initializing 12 dimms
[52803.640072] EDAC DEBUG: edac_mc_alloc: edac_mc_alloc: 0: dimm0 (0:0:0): row 0, chan 0
[52803.640074] EDAC DEBUG: edac_mc_alloc: edac_mc_alloc: 1: dimm1 (0:1:0): row 0, chan 1
[52803.640077] EDAC DEBUG: edac_mc_alloc: edac_mc_alloc: 2: dimm2 (0:2:0): row 0, chan 2
[52803.640080] EDAC DEBUG: edac_mc_alloc: edac_mc_alloc: 3: dimm3 (1:0:0): row 0, chan 3
[52803.640083] EDAC DEBUG: edac_mc_alloc: edac_mc_alloc: 4: dimm4 (1:1:0): row 1, chan 0
[52803.640086] EDAC DEBUG: edac_mc_alloc: edac_mc_alloc: 5: dimm5 (1:2:0): row 1, chan 1
[52803.640089] EDAC DEBUG: edac_mc_alloc: edac_mc_alloc: 6: dimm6 (2:0:0): row 1, chan 2
[52803.640092] EDAC DEBUG: edac_mc_alloc: edac_mc_alloc: 7: dimm7 (2:1:0): row 1, chan 3
[52803.640095] EDAC DEBUG: edac_mc_alloc: edac_mc_alloc: 8: dimm8 (2:2:0): row 2, chan 0
[52803.640098] EDAC DEBUG: edac_mc_alloc: edac_mc_alloc: 9: dimm9 (3:0:0): row 2, chan 1
[52803.640101] EDAC DEBUG: edac_mc_alloc: edac_mc_alloc: 10: dimm10 (3:1:0): row 2, chan 2
[52803.640104] EDAC DEBUG: edac_mc_alloc: edac_mc_alloc: 11: dimm11 (3:2:0): row 2, chan 3

With the above info, it is clear that the DIMM located at mc#1, channel#3 slot#2 is
called "dimm11" at the new API, and corresponds to "csrow 2, channel 3" for a legacy
EDAC API call.

Regards,
Mauro
Borislav Petkov April 30, 2012, 11:11 a.m. UTC | #5
On Mon, Apr 30, 2012 at 07:58:33AM -0300, Mauro Carvalho Chehab wrote:
> It seems you have a very short memory.

Oh, puh-lease, let's don't start with the insults now. You're not a
saint yourself. And maybe the fact that I'm having hard time grasping
your code is maybe because it is a load of crap and you seem to generate
a lot of senseless drivel when explaining what it does. And don't get me
started on the patch bombs.

So, let's stay constructive here before I, as the last and only one
person reviewing this stinking pile stops messing with it (I got other
stuff to do, you know) and NACK it completely.

> For example, this is the mapping used by the second memory controller of the SB machine
> I'm using on my tests:
> 
> [52803.640043] EDAC DEBUG: sbridge_probe: Registering MC#1 (2 of 2)
> ...
> [52803.640062] EDAC DEBUG: edac_mc_alloc: edac_mc_alloc(): allocating 7196 bytes for mci data (12 dimms, 12 csrows/channels)
> [52803.640070] EDAC DEBUG: edac_mc_alloc: edac_mc_alloc: initializing 12 dimms
> [52803.640072] EDAC DEBUG: edac_mc_alloc: edac_mc_alloc: 0: dimm0 (0:0:0): row 0, chan 0
> [52803.640074] EDAC DEBUG: edac_mc_alloc: edac_mc_alloc: 1: dimm1 (0:1:0): row 0, chan 1
> [52803.640077] EDAC DEBUG: edac_mc_alloc: edac_mc_alloc: 2: dimm2 (0:2:0): row 0, chan 2
> [52803.640080] EDAC DEBUG: edac_mc_alloc: edac_mc_alloc: 3: dimm3 (1:0:0): row 0, chan 3
> [52803.640083] EDAC DEBUG: edac_mc_alloc: edac_mc_alloc: 4: dimm4 (1:1:0): row 1, chan 0
> [52803.640086] EDAC DEBUG: edac_mc_alloc: edac_mc_alloc: 5: dimm5 (1:2:0): row 1, chan 1
> [52803.640089] EDAC DEBUG: edac_mc_alloc: edac_mc_alloc: 6: dimm6 (2:0:0): row 1, chan 2
> [52803.640092] EDAC DEBUG: edac_mc_alloc: edac_mc_alloc: 7: dimm7 (2:1:0): row 1, chan 3
> [52803.640095] EDAC DEBUG: edac_mc_alloc: edac_mc_alloc: 8: dimm8 (2:2:0): row 2, chan 0
> [52803.640098] EDAC DEBUG: edac_mc_alloc: edac_mc_alloc: 9: dimm9 (3:0:0): row 2, chan 1
> [52803.640101] EDAC DEBUG: edac_mc_alloc: edac_mc_alloc: 10: dimm10 (3:1:0): row 2, chan 2
> [52803.640104] EDAC DEBUG: edac_mc_alloc: edac_mc_alloc: 11: dimm11 (3:2:0): row 2, chan 3
> 
> With the above info, it is clear that the DIMM located at mc#1, channel#3 slot#2 is
> called "dimm11" at the new API, and corresponds to "csrow 2, channel 3" for a legacy
> EDAC API call.

Are all those DIMM slots above populated? What happens if they're not,
are you issuing the same dimm0-dimm11 lines for slots which aren't even
populated?

I have a much better idea: Generally, this debug info should come from
the specific driver that allocates the dimm descriptors, not from the
EDAC core. This way, you know in the driver which slots are populated
and those which are not should be omitted.

This way it says "initializing 12 dimms" and the user thinks there are
12 DIMMs on his system where this might not be true.
Mauro Carvalho Chehab April 30, 2012, 11:37 a.m. UTC | #6
Em 30-04-2012 05:15, Borislav Petkov escreveu:
> On Sun, Apr 29, 2012 at 10:49:44AM -0300, Mauro Carvalho Chehab wrote:
>>> [   10.486440] EDAC MC: DCT0 chip selects:
>>> [   10.486443] EDAC amd64: MC: 0:  2048MB 1:  2048MB
>>> [   10.486445] EDAC amd64: MC: 2:  2048MB 3:  2048MB
>>> [   10.486448] EDAC amd64: MC: 4:     0MB 5:     0MB
>>> [   10.486450] EDAC amd64: MC: 6:     0MB 7:     0MB
>>> [   10.486453] EDAC DEBUG: amd64_debug_display_dimm_sizes: F2x180 (DRAM Bank Address Mapping): 0x00000088
>>> [   10.486455] EDAC MC: DCT1 chip selects:
>>> [   10.486458] EDAC amd64: MC: 0:  2048MB 1:  2048MB
>>> [   10.486460] EDAC amd64: MC: 2:  2048MB 3:  2048MB
>>> [   10.486463] EDAC amd64: MC: 4:     0MB 5:     0MB
>>> [   10.486465] EDAC amd64: MC: 6:     0MB 7:     0MB
>>> [   10.486467] EDAC amd64: using x8 syndromes.
>>> [   10.486469] EDAC DEBUG: amd64_dump_dramcfg_low: F2x190 (DRAM Cfg Low): 0x00083100
>>> [   10.486472] EDAC DEBUG: amd64_dump_dramcfg_low:   DIMM type: buffered; all DIMMs support ECC: yes
>>> [   10.486475] EDAC DEBUG: amd64_dump_dramcfg_low:   PAR/ERR parity: enabled
>>> [   10.486478] EDAC DEBUG: amd64_dump_dramcfg_low:   DCT 128bit mode width: 64b
>>> [   10.486481] EDAC DEBUG: amd64_dump_dramcfg_low:   x4 logical DIMMs present: L0: yes L1: yes L2: no L3: no
>>> [   10.486485] EDAC DEBUG: f1x_early_channel_count: Data width is not 128 bits - need more decoding
>>> [   10.486488] EDAC amd64: MCT channel count: 2
>>> [   10.486493] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc(): allocating 3692 bytes for mci data (16 ranks, 16 csrows/channels)
>>> [   10.486501] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 0: rank0 (0:0:0): row 0, chan 0
>>> [   10.486506] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 1: rank1 (0:1:0): row 0, chan 1
>>> [   10.486510] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 2: rank2 (1:0:0): row 1, chan 0
>>> [   10.486514] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 3: rank3 (1:1:0): row 1, chan 1
>>> [   10.486518] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 4: rank4 (2:0:0): row 2, chan 0
>>> [   10.486522] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 5: rank5 (2:1:0): row 2, chan 1
>>> [   10.486526] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 6: rank6 (3:0:0): row 3, chan 0
>>> [   10.486530] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 7: rank7 (3:1:0): row 3, chan 1
>>> [   10.486534] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 8: rank8 (4:0:0): row 4, chan 0
>>> [   10.486538] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 9: rank9 (4:1:0): row 4, chan 1
>>> [   10.486542] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 10: rank10 (5:0:0): row 5, chan 0
>>> [   10.486546] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 11: rank11 (5:1:0): row 5, chan 1
>>> [   10.486550] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 12: rank12 (6:0:0): row 6, chan 0
>>> [   10.486554] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 13: rank13 (6:1:0): row 6, chan 1
>>> [   10.486558] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 14: rank14 (7:0:0): row 7, chan 0
>>> [   10.486562] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 15: rank15 (7:1:0): row 7, chan 1
>>>
>>> DCT0 has 4 ranks + DCT1 also 4 ranks = 8 ranks total.
>>>
>>> Now your change is showing 16 ranks. Still b0rked.
>>>
>> No, DCT0+DCT1 have 16 ranks, 8 filled and 8 empty. So, it is OK.
>>
>> As I said before when you've pointed this bug (likel at v3 review), edac_mc_alloc
>> doesn't know how many ranks are filled, as the driver logic first calls it to 
>> allocate for the max amount of ranks, and then fills the rank with their info 
>> (or let them untouched with 0 pages, if they're empty).
> 
> Basically you're saying you're generating dimm_info structs for all
> _possible_ dimms and the loop where this debug message comes from goes
> and marrily initializes them all although some of them are empty:
> 
> +       for (i = 0; i < tot_dimms; i++) {
> +               chan = &csi[row].channels[chn];
> +               dimm = EDAC_DIMM_PTR(lay, mci->dimms, n_layers,
> +                              pos[0], pos[1], pos[2]);
> +               dimm->mci = mci;
> +
> +               debugf2("%s: %d: dimm%zd (%d:%d:%d): row %d, chan %d\n", __func__,
> +                       i, (dimm - mci->dimms),
> +                       pos[0], pos[1], pos[2], row, chn);
> +
> +               /* Copy DIMM location */
> +               for (j = 0; j < n_layers; j++)
> +                       dimm->location[j] = pos[j];
> ...
> 
> definitely superfluous.

This is the way the EDAC core works: everything is allocated, on one shot, when this
function is called, and, on most drivers, before the code that probes how many DIMMS/ranks
got initialized. That happens because the edac_mc_alloc() arguments provide the total
amount of ranks/dimms, but doesn't say anything about what is used there.

Changing from this model to another model that would dynamically initialize the per-dimm/rank
data is possible, but that would require another set of patches that will touch on all
drivers, and to convert the edac_mc_alloc function into 3 or 4 function calls, with the
corresponding changes on all drivers. Also, the changes at the drivers won't likely be
trivial.

The patches that convert kobj into "struct device" does part of the job, as, after it,
each dimm/csrow will be allocated by a separate kmalloc.

After having this series fully applied, it would be possible to work on such solution.
I'll eventually do that, as this would simplify the code at i7core_edac and sb_edac.

Regards,
Mauro
Mauro Carvalho Chehab April 30, 2012, 11:45 a.m. UTC | #7
Em 30-04-2012 08:11, Borislav Petkov escreveu:
> On Mon, Apr 30, 2012 at 07:58:33AM -0300, Mauro Carvalho Chehab wrote:

>> For example, this is the mapping used by the second memory controller of the SB machine
>> I'm using on my tests:
>>
>> [52803.640043] EDAC DEBUG: sbridge_probe: Registering MC#1 (2 of 2)
>> ...
>> [52803.640062] EDAC DEBUG: edac_mc_alloc: edac_mc_alloc(): allocating 7196 bytes for mci data (12 dimms, 12 csrows/channels)
>> [52803.640070] EDAC DEBUG: edac_mc_alloc: edac_mc_alloc: initializing 12 dimms
>> [52803.640072] EDAC DEBUG: edac_mc_alloc: edac_mc_alloc: 0: dimm0 (0:0:0): row 0, chan 0
>> [52803.640074] EDAC DEBUG: edac_mc_alloc: edac_mc_alloc: 1: dimm1 (0:1:0): row 0, chan 1
>> [52803.640077] EDAC DEBUG: edac_mc_alloc: edac_mc_alloc: 2: dimm2 (0:2:0): row 0, chan 2
>> [52803.640080] EDAC DEBUG: edac_mc_alloc: edac_mc_alloc: 3: dimm3 (1:0:0): row 0, chan 3
>> [52803.640083] EDAC DEBUG: edac_mc_alloc: edac_mc_alloc: 4: dimm4 (1:1:0): row 1, chan 0
>> [52803.640086] EDAC DEBUG: edac_mc_alloc: edac_mc_alloc: 5: dimm5 (1:2:0): row 1, chan 1
>> [52803.640089] EDAC DEBUG: edac_mc_alloc: edac_mc_alloc: 6: dimm6 (2:0:0): row 1, chan 2
>> [52803.640092] EDAC DEBUG: edac_mc_alloc: edac_mc_alloc: 7: dimm7 (2:1:0): row 1, chan 3
>> [52803.640095] EDAC DEBUG: edac_mc_alloc: edac_mc_alloc: 8: dimm8 (2:2:0): row 2, chan 0
>> [52803.640098] EDAC DEBUG: edac_mc_alloc: edac_mc_alloc: 9: dimm9 (3:0:0): row 2, chan 1
>> [52803.640101] EDAC DEBUG: edac_mc_alloc: edac_mc_alloc: 10: dimm10 (3:1:0): row 2, chan 2
>> [52803.640104] EDAC DEBUG: edac_mc_alloc: edac_mc_alloc: 11: dimm11 (3:2:0): row 2, chan 3
>>
>> With the above info, it is clear that the DIMM located at mc#1, channel#3 slot#2 is
>> called "dimm11" at the new API, and corresponds to "csrow 2, channel 3" for a legacy
>> EDAC API call.
> 
> Are all those DIMM slots above populated? What happens if they're not,
> are you issuing the same dimm0-dimm11 lines for slots which aren't even
> populated?
> 
> I have a much better idea: Generally, this debug info should come from
> the specific driver that allocates the dimm descriptors, not from the
> EDAC core. This way, you know in the driver which slots are populated
> and those which are not should be omitted.

The drivers don't allocate the dimm descriptors. They're allocated by the
core.

> This way it says "initializing 12 dimms" and the user thinks there are
> 12 DIMMs on his system where this might not be true.


I'm OK to remove the "initializing 12 dimms" message. It doesn't add anything
new.

With regards do the other messages, if the debug messages are not clear, 
then let's fix them, instead of removing. What if we print, instead,
on a message like:

	"row 1, chan 1 will represent dimm5 (1:2:0) if not empty"

Regards,
Mauro
Borislav Petkov April 30, 2012, 12:38 p.m. UTC | #8
On Mon, Apr 30, 2012 at 08:45:09AM -0300, Mauro Carvalho Chehab wrote:
> Em 30-04-2012 08:11, Borislav Petkov escreveu:
> > On Mon, Apr 30, 2012 at 07:58:33AM -0300, Mauro Carvalho Chehab wrote:
> 
> >> For example, this is the mapping used by the second memory controller of the SB machine
> >> I'm using on my tests:
> >>
> >> [52803.640043] EDAC DEBUG: sbridge_probe: Registering MC#1 (2 of 2)
> >> ...
> >> [52803.640062] EDAC DEBUG: edac_mc_alloc: edac_mc_alloc(): allocating 7196 bytes for mci data (12 dimms, 12 csrows/channels)
> >> [52803.640070] EDAC DEBUG: edac_mc_alloc: edac_mc_alloc: initializing 12 dimms
> >> [52803.640072] EDAC DEBUG: edac_mc_alloc: edac_mc_alloc: 0: dimm0 (0:0:0): row 0, chan 0
> >> [52803.640074] EDAC DEBUG: edac_mc_alloc: edac_mc_alloc: 1: dimm1 (0:1:0): row 0, chan 1
> >> [52803.640077] EDAC DEBUG: edac_mc_alloc: edac_mc_alloc: 2: dimm2 (0:2:0): row 0, chan 2
> >> [52803.640080] EDAC DEBUG: edac_mc_alloc: edac_mc_alloc: 3: dimm3 (1:0:0): row 0, chan 3
> >> [52803.640083] EDAC DEBUG: edac_mc_alloc: edac_mc_alloc: 4: dimm4 (1:1:0): row 1, chan 0
> >> [52803.640086] EDAC DEBUG: edac_mc_alloc: edac_mc_alloc: 5: dimm5 (1:2:0): row 1, chan 1
> >> [52803.640089] EDAC DEBUG: edac_mc_alloc: edac_mc_alloc: 6: dimm6 (2:0:0): row 1, chan 2
> >> [52803.640092] EDAC DEBUG: edac_mc_alloc: edac_mc_alloc: 7: dimm7 (2:1:0): row 1, chan 3
> >> [52803.640095] EDAC DEBUG: edac_mc_alloc: edac_mc_alloc: 8: dimm8 (2:2:0): row 2, chan 0
> >> [52803.640098] EDAC DEBUG: edac_mc_alloc: edac_mc_alloc: 9: dimm9 (3:0:0): row 2, chan 1
> >> [52803.640101] EDAC DEBUG: edac_mc_alloc: edac_mc_alloc: 10: dimm10 (3:1:0): row 2, chan 2
> >> [52803.640104] EDAC DEBUG: edac_mc_alloc: edac_mc_alloc: 11: dimm11 (3:2:0): row 2, chan 3
> >>
> >> With the above info, it is clear that the DIMM located at mc#1, channel#3 slot#2 is
> >> called "dimm11" at the new API, and corresponds to "csrow 2, channel 3" for a legacy
> >> EDAC API call.
> > 
> > Are all those DIMM slots above populated? What happens if they're not,
> > are you issuing the same dimm0-dimm11 lines for slots which aren't even
> > populated?
> > 
> > I have a much better idea: Generally, this debug info should come from
> > the specific driver that allocates the dimm descriptors, not from the
> > EDAC core. This way, you know in the driver which slots are populated
> > and those which are not should be omitted.
> 
> The drivers don't allocate the dimm descriptors. They're allocated by the
> core.

I know that. The drivers call into EDAC core using edac_mc_alloc, this
is what I meant above.

> > This way it says "initializing 12 dimms" and the user thinks there are
> > 12 DIMMs on his system where this might not be true.
> 
> 
> I'm OK to remove the "initializing 12 dimms" message. It doesn't add anything
> new.
> 
> With regards do the other messages, if the debug messages are not clear, 
> then let's fix them, instead of removing. What if we print, instead,
> on a message like:
> 
> 	"row 1, chan 1 will represent dimm5 (1:2:0) if not empty"

How about the following instead: the specific driver calls
edac_mc_alloc(), it gets the allocated dimm array in mci->dimms
_without_ dumping each dimm%d line. Then, each driver figures out which
subset of that dimms array actually has populated slots and prints only
the populated rank/slot/...

This information is much more valuable than saying how many _possible_
slots the edac core has allocated.

Then, each driver can decide whether it makes sense to dump that info or
not.
Mauro Carvalho Chehab April 30, 2012, 1 p.m. UTC | #9
Em 30-04-2012 09:38, Borislav Petkov escreveu:
> On Mon, Apr 30, 2012 at 08:45:09AM -0300, Mauro Carvalho Chehab wrote:
>> Em 30-04-2012 08:11, Borislav Petkov escreveu:
>>> On Mon, Apr 30, 2012 at 07:58:33AM -0300, Mauro Carvalho Chehab wrote:

>>> This way it says "initializing 12 dimms" and the user thinks there are
>>> 12 DIMMs on his system where this might not be true.
>>
>>
>> I'm OK to remove the "initializing 12 dimms" message. It doesn't add anything
>> new.
>>
>> With regards do the other messages, if the debug messages are not clear, 
>> then let's fix them, instead of removing. What if we print, instead,
>> on a message like:
>>
>> 	"row 1, chan 1 will represent dimm5 (1:2:0) if not empty"
> 
> How about the following instead: the specific driver calls
> edac_mc_alloc(), it gets the allocated dimm array in mci->dimms
> _without_ dumping each dimm%d line. Then, each driver figures out which
> subset of that dimms array actually has populated slots and prints only
> the populated rank/slot/...
> 
> This information is much more valuable than saying how many _possible_
> slots the edac core has allocated.
> 
> Then, each driver can decide whether it makes sense to dump that info or
> not.

No, that would add extra complexity at the drivers level just due to debug
messages. I think that the better is to move this printk to the debug-specific
routine that is called only when the dimm is filled (edac_mc_dump_dimm).

With this cange, the message will be printed only for the filled dimms.

This is a cleanup patch, so I'll write it, together with the change that
will get rid of the loop that uses KERN_CONT. It will use a function added
by a latter patch at edac_mc_sysfs so it can't be merged on this patch
anyway.

Regards,
Mauro
diff mbox

Patch

diff --git a/drivers/edac/edac_mc.c b/drivers/edac/edac_mc.c
index 4d4d8b7..e0d9481 100644
--- a/drivers/edac/edac_mc.c
+++ b/drivers/edac/edac_mc.c
@@ -86,7 +86,7 @@  static void edac_mc_dump_mci(struct mem_ctl_info *mci)
 	debugf4("\tmci->edac_check = %p\n", mci->edac_check);
 	debugf3("\tmci->nr_csrows = %d, csrows = %p\n",
 		mci->nr_csrows, mci->csrows);
-	debugf3("\tmci->nr_dimms = %d, dimns = %p\n",
+	debugf3("\tmci->nr_dimms = %d, dimms = %p\n",
 		mci->tot_dimms, mci->dimms);
 	debugf3("\tdev = %p\n", mci->dev);
 	debugf3("\tmod_name:ctl_name = %s:%s\n", mci->mod_name, mci->ctl_name);
@@ -183,10 +183,6 @@  void *edac_align_ptr(void **p, unsigned size, int n_elems)
  * @size_pvt:		size of private storage needed
  *
  *
- * FIXME: drivers handle multi-rank memories on different ways: on some
- * drivers, one multi-rank memory is mapped as one DIMM, while, on others,
- * a single multi-rank DIMM would be mapped into several "dimms".
- *
  * Non-csrow based drivers (like FB-DIMM and RAMBUS ones) will likely report
  * such DIMMS properly, but the CSROWS-based ones will likely do the wrong
  * thing, as two chip select values are used for dual-rank memories (and 4, for
@@ -201,6 +197,12 @@  void *edac_align_ptr(void **p, unsigned size, int n_elems)
  *
  * Use edac_mc_free() to free mc structures allocated by this function.
  *
+ * NOTE: drivers handle multi-rank memories on different ways: on some
+ * drivers, one multi-rank memory is mapped as one entry, while, on others,
+ * a single multi-rank DIMM would be mapped into several entries. Currently,
+ * this function will allocate multiple struct dimm_info on such scenarios,
+ * as grouping the multiple ranks require drivers change.
+ *
  * Returns:
  *	NULL allocation failed
  *	struct mem_ctl_info pointer
@@ -220,10 +222,11 @@  struct mem_ctl_info *new_edac_mc_alloc(unsigned edac_index,
 	u32 *ce_per_layer[EDAC_MAX_LAYERS], *ue_per_layer[EDAC_MAX_LAYERS];
 	void *pvt;
 	unsigned size, tot_dimms, count, pos[EDAC_MAX_LAYERS];
-	unsigned tot_csrows, tot_cschannels;
+	unsigned tot_csrows, tot_cschannels, tot_errcount = 0;
 	int i, j;
 	int err;
 	int row, chn;
+	bool per_rank = false;
 
 	BUG_ON(n_layers > EDAC_MAX_LAYERS);
 	/*
@@ -239,6 +242,9 @@  struct mem_ctl_info *new_edac_mc_alloc(unsigned edac_index,
 			tot_csrows *= layers[i].size;
 		else
 			tot_cschannels *= layers[i].size;
+
+		if (layers[i].type == EDAC_MC_LAYER_CHIP_SELECT)
+			per_rank = true;
 	}
 
 	/* Figure out the offsets of the various items from the start of an mc
@@ -254,14 +260,21 @@  struct mem_ctl_info *new_edac_mc_alloc(unsigned edac_index,
 	count = 1;
 	for (i = 0; i < n_layers; i++) {
 		count *= layers[i].size;
+		debugf4("%s: errcount layer %d size %d\n", __func__, i, count);
 		ce_per_layer[i] = edac_align_ptr(&ptr, sizeof(u32), count);
 		ue_per_layer[i] = edac_align_ptr(&ptr, sizeof(u32), count);
+		tot_errcount += 2 * count;
 	}
+
+	debugf4("%s: allocating %d error counters\n", __func__, tot_errcount);
 	pvt = edac_align_ptr(&ptr, sz_pvt, 1);
 	size = ((unsigned long)pvt) + sz_pvt;
 
-	debugf1("%s(): allocating %u bytes for mci data (%d dimms, %d csrows/channels)\n",
-		__func__, size, tot_dimms, tot_csrows * tot_cschannels);
+	debugf1("%s(): allocating %u bytes for mci data (%d %s, %d csrows/channels)\n",
+		__func__, size,
+		tot_dimms,
+		per_rank ? "ranks" : "dimms",
+		tot_csrows * tot_cschannels);
 	mci = kzalloc(size, GFP_KERNEL);
 	if (mci == NULL)
 		return NULL;
@@ -290,6 +303,7 @@  struct mem_ctl_info *new_edac_mc_alloc(unsigned edac_index,
 	memcpy(mci->layers, layers, sizeof(*lay) * n_layers);
 	mci->nr_csrows = tot_csrows;
 	mci->num_cschannel = tot_cschannels;
+	mci->mem_is_per_rank = per_rank;
 
 	/*
 	 * Fills the csrow struct
@@ -315,15 +329,16 @@  struct mem_ctl_info *new_edac_mc_alloc(unsigned edac_index,
 	memset(&pos, 0, sizeof(pos));
 	row = 0;
 	chn = 0;
-	debugf4("%s: initializing %d dimms\n", __func__, tot_dimms);
+	debugf4("%s: initializing %d %s\n", __func__, tot_dimms,
+		per_rank ? "ranks" : "dimms");
 	for (i = 0; i < tot_dimms; i++) {
 		chan = &csi[row].channels[chn];
 		dimm = EDAC_DIMM_PTR(lay, mci->dimms, n_layers,
 			       pos[0], pos[1], pos[2]);
 		dimm->mci = mci;
 
-		debugf2("%s: %d: dimm%zd (%d:%d:%d): row %d, chan %d\n", __func__,
-			i, (dimm - mci->dimms),
+		debugf2("%s: %d: %s%zd (%d:%d:%d): row %d, chan %d\n", __func__,
+			i, per_rank ? "rank" : "dimm", (dimm - mci->dimms),
 			pos[0], pos[1], pos[2], row, chn);
 
 		/* Copy DIMM location */
@@ -1040,8 +1055,10 @@  void edac_mc_handle_error(const enum hw_event_mc_err_type type,
 			 * get csrow/channel of the dimm, in order to allow
 			 * incrementing the compat API counters
 			 */
-			debugf4("%s: dimm csrows (%d,%d)\n",
-				__func__, dimm->csrow, dimm->cschannel);
+			debugf4("%s: %s csrows map: (%d,%d)\n",
+				__func__,
+				mci->mem_is_per_rank ? "rank" : "dimm",
+				dimm->csrow, dimm->cschannel);
 			if (row == -1)
 				row = dimm->csrow;
 			else if (row >= 0 && row != dimm->csrow)
diff --git a/include/linux/edac.h b/include/linux/edac.h
index 412d5cd..2b66109 100644
--- a/include/linux/edac.h
+++ b/include/linux/edac.h
@@ -555,6 +555,8 @@  struct mem_ctl_info {
 	/* Memory Controller hierarchy */
 	unsigned n_layers;
 	struct edac_mc_layer *layers;
+	bool mem_is_per_rank;
+
 	/*
 	 * DIMM info. Will eventually remove the entire csrows_info some day
 	 */