While executing systemtap tests with this week snapshot saw the following issue, ================= 0:mon> e cpu 0x0: Vector: 300 (Data Access) at [c0000000fa7c3940] pc: c00000000004998c: .memcmp+0x18/0x30 lr: d000000002785260: ._stp_ctl_write_cmd+0x250/0x840 [stap_3749988c31b9ae53b507a42e405bc1a8_575696] sp: c0000000fa7c3bc0 msr: 8000000000009032 dar: d0000000015c2064 dsisr: 40000000 current = 0xc0000000f793db00 paca = 0xc000000001482600 pid = 29521, comm = stapio 0:mon> t [link register ] d000000002785260 ._stp_ctl_write_cmd+0x250/0x840 [stap_3749988c31b9ae53b507a42e405bc1a8_575696] [c0000000fa7c3bc0] d00000000278581c ._stp_ctl_write_cmd+0x80c/0x840 [stap_3749988c31b9ae53b507a42e405bc1a8_575696] (unreliable) [c0000000fa7c3ce0] c0000000001ba72c .vfs_write+0xec/0x1f0 [c0000000fa7c3d80] c0000000001ba958 .SyS_write+0x58/0xb0 [c0000000fa7c3e30] c000000000008570 syscall_exit+0x0/0x40 --- Exception: c00 (System Call) at 00000fffb5fa66cc SP (fffe757eff0) is in userspace 0:mon> r R00 = 000000000000006b R16 = 0000000010017b78 R01 = c0000000fa7c3bc0 R17 = 00000fffe757f5b0 R02 = c000000001377d30 R18 = 0000000010008b18 R03 = 00000000000000ec R19 = 0000000010008770 R04 = d0000000015c2063 R20 = d0000000015b051c R05 = 0000000000000014 R21 = d000000002787918 R06 = d0000000027a8938 R22 = 0000000000000014 R07 = 0000000000000048 R23 = d0000000015c2078 R08 = 000000000000005a R24 = d0000000027885e0 R09 = d0000000027bcf00 R25 = d0000000027876c8 R10 = 0000000000000001 R26 = d0000000027c3000 R11 = c000000000049974 R27 = c0000000fa7c3c40 R12 = d0000000027864b0 R28 = 0000000000000008 R13 = c000000001482600 R29 = d000000002838ba0 R14 = 0000000010009fe0 R30 = d0000000027f0038 R15 = 0000000010011af8 R31 = d0000000027e8ba0 pc = c00000000004998c .memcmp+0x18/0x30 lr = d000000002785260 ._stp_ctl_write_cmd+0x250/0x840 [stap_3749988c31b9ae53b507a42e405bc1a8_575696] msr = 8000000000009032 cr = 44000444 ctr = 0000000000000014 xer = 0000000000000000 trap = 300 dar = d0000000015c2064 dsisr = 40000000 0:mon> di %pc c00000000004998c 8c040001 lbzu r0,1(r4) c000000000049990 7c601851 subf. r3,r0,r3 c000000000049994 4102fff4 bdnzt eq,c000000000049988 # .memcmp+0x14/0x30 c000000000049998 4e800020 blr c00000000004999c 38600000 li r3,0 c0000000000499a0 4e800020 blr c0000000000499a4 2c050000 cmpwi r5,0 c0000000000499a8 4081001c ble c0000000000499c4 # .memchr+0x20/0x28 c0000000000499ac 7ca903a6 mtctr r5 c0000000000499b0 3863ffff addi r3,r3,-1 c0000000000499b4 8c030001 lbzu r0,1(r3) c0000000000499b8 7c002000 cmpw r0,r4 c0000000000499bc 4002fff8 bdnzf eq,c0000000000499b4 # .memchr+0x10/0x28 c0000000000499c0 4d820020 beqlr c0000000000499c4 38600000 li r3,0 c0000000000499c8 4e800020 blr 0:mon> ========================
Created attachment 4302 [details] logs collected by autotest
If you still have access to a stap*.ko that exhibits this problem, could you "objdump -drS" it to disassemble it, to see where this memcmp() call is coming from? (We have one explicit call to it, in buildid checking, but that doesn't seem THAT close to the write_cmd path.)
Can recreate this with git (2.6.33-rc4-git5 ..6ccf80eb..). Here is the supporting info. stap_07505bf65828be510a2a08652a12f45b_4303: systemtap: 1.1/0.144, base: d0000000099f0000, memory: 29data/40text/145ctx/13net/24856alloc kb, probes: 4 Unable to handle kernel paging request for data at address 0xd000000001d02710 Faulting instruction address: 0xc00000000004460c Oops: Kernel access of bad area, sig: 11 [#1] SMP NR_CPUS=1024 DEBUG_PAGEALLOC NUMA pSeries Modules linked in: stap_5e64ec831c6c1c2fa9784d7081f8e5fa_7859 ipv6 fuse loop dm_mod sr_mod ibmveth cdrom sg sd_mod crc_t10dif ibmvscsic scsi_transport_srp scsi_tgt scsi_mod [last unloaded: stap_07505bf65828be510a2a08652a12f45b_4303] NIP: c00000000004460c LR: d000000009d563d4 CTR: 0000000000000014 REGS: c0000000e6513950 TRAP: 0300 Not tainted (2.6.33-rc4-git5-autotest) MSR: 8000000000009032 <EE,ME,IR,DR> CR: 44000424 XER: 00000001 DAR: d000000001d02710, DSISR: 0000000040010000 TASK = c0000000ec9fc050[18734] 'stapio' THREAD: c0000000e6510000 CPU: 17 GPR00: 000000000000006b c0000000e6513bd0 c000000000bb1e70 000000000000009d GPR04: d000000001d0270f 0000000000000014 d000000009d59315 0000000000000000 GPR08: 000000000000e99c d000000009d59200 c000000010001d20 c0000000000445f4 GPR12: d000000009d58160 c000000000c85080 0000000010008f48 00000fffedd0aa60 GPR16: 0000000010022670 0000000010008af0 0000000010008ad0 0000000000000008 GPR20: 0000000010022890 0000000010008ea0 0000000010003954 000000001001b018 GPR24: 00000fffedd0aa64 0000000010022668 d000000009d591f8 d000000009d59b34 GPR28: d000000001d02724 d000000001cf8638 d000000009d629b8 d000000009d5a540 NIP [c00000000004460c] .memcmp+0x18/0x30 LR [d000000009d563d4] ._stp_ctl_write_cmd+0x45c/0x98c [stap_5e64ec831c6c1c2fa9784d7081f8e5fa_7859] Call Trace: [c0000000e6513bd0] [d000000009d562e0] ._stp_ctl_write_cmd+0x368/0x98c [stap_5e64ec831c6c1c2fa9784d7081f8e5fa_7859] (unreliable) [c0000000e6513ce0] [c000000000182cd8] .vfs_write+0xd0/0x1dc [c0000000e6513d80] [c000000000182eec] .SyS_write+0x58/0xa0 [c0000000e6513e30] [c0000000000085b4] syscall_exit+0x0/0x40 Instruction dump: 3883ffff 8c040001 2c000000 4082fff8 7c632050 4e800020 2c050000 40c10024 7ca903a6 38c3ffff 3884ffff 8c660001 <8c040001> 7c601851 4102fff4 4e800020 ---[ end trace 21b12feeeff20bee ]--- objdump of the corresponding module shows this following code 63b8: 7f 9d 2a 14 add r28,r29,r5 } /* build-id note payload start address */ /* XXX: But see https://bugzilla.redhat.com/show_bug.cgi?id=465872; dwfl_module_build_id was not intended to return the end address. */ notes_addr -= m->build_id_len; 63bc: e8 bf 00 7a lwa r5,120(r31) 63c0: 7c 85 e0 50 subf r4,r5,r28 if (notes_addr <= base_addr) /* shouldn't happen */ 63c4: 7f a4 e8 40 cmpld cr7,r4,r29 63c8: 40 9d ff 48 ble+ cr7,6310 <._stp_ctl_write_cmd+0x398> continue; if (memcmp(m->build_id_bits, (unsigned char*) notes_addr, m->build_id_len)) { 63cc: e8 7f 00 60 ld r3,96(r31) 63d0: 48 00 00 01 bl 63d0 <._stp_ctl_write_cmd+0x458> 63d0: R_PPC64_REL24 .memcmp 63d4: 60 00 00 00 nop 63d8: 2f a3 00 00 cmpdi cr7,r3,0 63dc: 41 9e ff 34 beq+ cr7,6310 <._stp_ctl_write_cmd+0x398> const char *basename; basename = strrchr(m->path, '/'); Attaching the objdump o/p.
Created attachment 4549 [details] objdump -drS output
It appears as though one of the memcmp address parameters used for buildid verification may be wrong. Without further symbol data, I don't have a theory as to why that would be, but we might try ... - changing the stap code to emit a loop of kread()s to compare the buildid bytes, so in case of an address miscalculation, it will fail cleanly - disabling the buildid comparison generation code in the translator
Please see if commit #f4191849c improves the situation.
Please check whether commits 71fa1fe39 and 77123275 fix the problem.
*** Bug 11814 has been marked as a duplicate of this bug. ***
Believed fixed.