| /* Copyright 2013 Google Inc. All Rights Reserved. |
| * |
| * Licensed under the Apache License, Version 2.0 (the "License"); |
| * you may not use this file except in compliance with the License. |
| * You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| |
| |
| /* |
| * This "tool" can be used to brute force the XOR bitmask that a memory |
| * controller uses to interleave addresses onto its two channels. To use it, |
| * you need to have a bunch of addresses that are known to go to only one |
| * of the memory channels... easiest way to get these is to run stressapptest on |
| * a machine while holding a soldering iron close to the chips of one channel. |
| * Generate about a thousand failures and extract their physical addresses |
| * from the output. Write them to findmask.inc in a way that forms a valid |
| * definition for the addrs array. Make and run on a big machine. |
| * |
| * The program iterates over all possible bitmasks within the first NUM_BITS, |
| * parallelizing execution over NUM_THREADS. Every integer is masked |
| * onto all supplied addresses, counting the number of times this results in |
| * an odd or even number of set bits. If fewer than NOISE addresses fall on |
| * the minority side, it will print that mask to stdout. Note that the program |
| * will always "find" the mask 0x0, and may also report masks such as |
| * 0x100000000 depending on your test machine's memory size... you will need |
| * to use your own judgement to interpret the results. |
| * |
| * As the program might run for a long time, you can send SIGUSR1 to it to |
| * output the last mask that was processed and get a rough idea of the |
| * current progress. |
| */ |
| |
#include <errno.h>
#include <inttypes.h>
#include <pthread.h>
#include <signal.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
| |
| #define NOISE 20 |
| #define NUM_BITS 32 |
| #define NUM_THREADS 128 // keep this a power of two |
| |
| static uint64_t addrs[] = { |
| #include "findmask.inc" |
| }; |
| static uint64_t lastmask; |
| |
| __attribute__((optimize(3, "unroll-loops"))) |
| void* thread_func(void* arg) { |
| register uint64_t mask; |
| register uintptr_t num = (uintptr_t)arg; |
| |
| for (mask = num; mask < (1ULL << (NUM_BITS + 1)); mask += NUM_THREADS) { |
| register const uint64_t* cur; |
| register int a = 0; |
| register int b = 0; |
| |
| for (cur = addrs; (char*)cur < (char*)addrs + sizeof(addrs); cur++) { |
| #ifdef __x86_64__ |
| register uint64_t addr asm("rdx") = *cur & mask; |
| register uint32_t tmp asm("ebx"); |
| |
| // Behold: the dark bit counting magic! |
| asm ( |
| // Fold high and low 32 bits onto each other |
| "MOVl %%edx, %%ebx\n\t" |
| "SHRq $32, %%rdx\n\t" |
| "XORl %%ebx, %%edx\n\t" |
| // Fold high and low 16 bits onto each other |
| "MOVl %%edx, %%ebx\n\t" |
| "SHRl $16, %%edx\n\t" |
| "XORw %%bx, %%dx\n\t" |
| // Fold high and low 8 bits onto each other |
| "XORb %%dh, %%dl\n\t" |
| // Invoke ancient 8086 parity flag (only counts lowest byte) |
| "SETnp %%bl\n\t" |
| "SETp %%dl\n\t" |
| // Stupid SET instruction can only affect the lowest byte... |
| "ANDl $1, %%ebx\n\t" |
| "ANDl $1, %%edx\n\t" |
| // Increment either 'a' or 'b' without needing another branch |
| "ADDl %%ebx, %2\n\t" |
| "ADDl %%edx, %1\n\t" |
| : "=b" (tmp), "+r"(a), "+r"(b) : "d"(addr) : "cc"); |
| |
| #else // generic processor |
| register uint64_t addr = *cur & mask; |
| register uint32_t low = (uint32_t)addr; |
| register uint32_t high = (uint32_t)(addr >> 32); |
| |
| // Takes about twice as long as the version above... take that GCC! |
| __builtin_parity(low) ^ __builtin_parity(high) ? a++ : b++; |
| #endif |
| |
| // Early abort: probably still the most valuable optimization in here |
| if (a >= NOISE && b >= NOISE) break; |
| } |
| |
| if (a < NOISE) b = a; |
| if (b < NOISE) { |
| printf("Found mask with just %d deviations: 0x%" PRIx64 "\n", b, mask); |
| fflush(stdout); |
| } |
| |
| // I'm a little paranoid about performance: don't write to memory too often |
| if (!(mask & 0x7ff)) lastmask = mask; |
| } |
| |
| return 0; |
| } |
| |
| void signal_handler(int signum) { |
| printf("Received signal... currently evaluating mask 0x%" PRIx64 "!\n", |
| lastmask); |
| fflush(stdout); |
| } |
| |
| int main(int argc, char** argv) { |
| uintptr_t i; |
| pthread_t threads[NUM_THREADS]; |
| |
| signal(SIGUSR1, signal_handler); |
| |
| for (i = 0; i < NUM_THREADS; i++) |
| pthread_create(&threads[i], 0, thread_func, (void*)i); |
| |
| for (i = 0; i < NUM_THREADS; i++) |
| pthread_join(threads[i], 0); |
| |
| return 0; |
| } |