| // Copyright 2008 Google Inc. All Rights Reserved. |
| |
| // Licensed under the Apache License, Version 2.0 (the "License"); |
| // you may not use this file except in compliance with the License. |
| // You may obtain a copy of the License at |
| |
| // http://www.apache.org/licenses/LICENSE-2.0 |
| |
| // Unless required by applicable law or agreed to in writing, software |
| // distributed under the License is distributed on an "AS IS" BASIS, |
| // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| // See the License for the specific language governing permissions and |
| // limitations under the License. |
| |
| #include "adler32memcpy.h" |
| |
| // We are using (a modified form of) adler-32 checksum algorithm instead |
| // of CRC since adler-32 is faster than CRC. |
| // (Comparison: http://guru.multimedia.cx/crc32-vs-adler32/) |
| // This form of adler is bit modified, instead of treating the data in |
| // units of bytes, 32-bit data is taken as a unit and two 64-bit |
| // checksums are done (we could have one checksum but two checksums |
| // make the code run faster). |
| |
| // Adler-32 implementation: |
| // Data is treated as 1-byte numbers and, |
| // there are two 16-bit numbers a and b |
| // Initialize a with 1 and b with 0. |
| // for each data unit 'd' |
| // a += d |
| // b += a |
| // checksum = a<<16 + b |
| // This sum should never overflow. |
| // |
| // Adler-64+64 implementation: |
| // (applied in this code) |
| // Data is treated as 32-bit numbers and whole data is separated into two |
| // streams, and hence the two checksums a1, a2, b1 and b2. |
| // Initialize a1 and a2 with 1, b1 and b2 with 0 |
| // add first dataunit to a1 |
| // add a1 to b1 |
| // add second dataunit to a1 |
| // add a1 to b1 |
| // add third dataunit to a2 |
| // add a2 to b2 |
| // add fourth dataunit to a2 |
| // add a2 to b2 |
| // ... |
| // repeat the sequence back for next 4 dataunits |
| // |
| // variable A = XMM6 and variable B = XMM7. |
| // (a1 = lower 8 bytes of XMM6 and b1 = lower 8 bytes of XMM7) |
| |
| // Assumptions |
| // 1. size_in_bytes is a multiple of 16. |
| // 2. srcmem and dstmem are 16 byte aligned. |
| // 3. size_in_bytes is less than 2^19 bytes. |
| |
| // Assumption 3 ensures that there is no overflow when numbers are being |
| // added (we can remove this assumption by doing modulus with a prime |
| // number when it is just about to overflow but that would be a very costly |
| // exercise) |
| |
| // Returns true if the checksums are equal. |
| bool AdlerChecksum::Equals(const AdlerChecksum &other) const { |
| return ( (a1_ == other.a1_) && (a2_ == other.a2_) && |
| (b1_ == other.b1_) && (b2_ == other.b2_) ); |
| } |
| |
| // Returns string representation of the Adler checksum. |
| string AdlerChecksum::ToHexString() const { |
| char buffer[128]; |
| snprintf(buffer, sizeof(buffer), "%016llx %016llx %016llx %016llx", a1_, a2_, b1_, b2_); |
| return string(buffer); |
| } |
| |
| // Sets components of the Adler checksum. |
| void AdlerChecksum::Set(uint64 a1, uint64 a2, uint64 b1, uint64 b2) { |
| a1_ = a1; |
| a2_ = a2; |
| b1_ = b1; |
| b2_ = b2; |
| } |
| |
| // Calculates Adler checksum for supplied data. |
| bool CalculateAdlerChecksum(uint64 *data64, unsigned int size_in_bytes, |
| AdlerChecksum *checksum) { |
| // Use this data wrapper to access memory with 64bit read/write. |
| datacast_t data; |
| unsigned int count = size_in_bytes / sizeof(data); |
| |
| if (count > (1U) << 19) { |
| // Size is too large, must be strictly less than 512 KB. |
| return false; |
| } |
| |
| uint64 a1 = 1; |
| uint64 a2 = 1; |
| uint64 b1 = 0; |
| uint64 b2 = 0; |
| |
| unsigned int i = 0; |
| while (i < count) { |
| // Process 64 bits at a time. |
| data.l64 = data64[i]; |
| a1 = a1 + data.l32.l; |
| b1 = b1 + a1; |
| a1 = a1 + data.l32.h; |
| b1 = b1 + a1; |
| i++; |
| |
| data.l64 = data64[i]; |
| a2 = a2 + data.l32.l; |
| b2 = b2 + a2; |
| a2 = a2 + data.l32.h; |
| b2 = b2 + a2; |
| i++; |
| } |
| checksum->Set(a1, a2, b1, b2); |
| return true; |
| } |
| |
| // C implementation of Adler memory copy. |
| bool AdlerMemcpyC(uint64 *dstmem64, uint64 *srcmem64, |
| unsigned int size_in_bytes, AdlerChecksum *checksum) { |
| // Use this data wrapper to access memory with 64bit read/write. |
| datacast_t data; |
| unsigned int count = size_in_bytes / sizeof(data); |
| |
| if (count > ((1U) << 19)) { |
| // Size is too large, must be strictly less than 512 KB. |
| return false; |
| } |
| |
| uint64 a1 = 1; |
| uint64 a2 = 1; |
| uint64 b1 = 0; |
| uint64 b2 = 0; |
| |
| unsigned int i = 0; |
| while (i < count) { |
| // Process 64 bits at a time. |
| data.l64 = srcmem64[i]; |
| a1 = a1 + data.l32.l; |
| b1 = b1 + a1; |
| a1 = a1 + data.l32.h; |
| b1 = b1 + a1; |
| dstmem64[i] = data.l64; |
| i++; |
| |
| data.l64 = srcmem64[i]; |
| a2 = a2 + data.l32.l; |
| b2 = b2 + a2; |
| a2 = a2 + data.l32.h; |
| b2 = b2 + a2; |
| dstmem64[i] = data.l64; |
| i++; |
| } |
| checksum->Set(a1, a2, b1, b2); |
| return true; |
| } |
| |
| // C implementation of Adler memory copy with some float point ops, |
| // attempting to warm up the CPU. |
| bool AdlerMemcpyWarmC(uint64 *dstmem64, uint64 *srcmem64, |
| unsigned int size_in_bytes, AdlerChecksum *checksum) { |
| // Use this data wrapper to access memory with 64bit read/write. |
| datacast_t data; |
| unsigned int count = size_in_bytes / sizeof(data); |
| |
| if (count > ((1U) << 19)) { |
| // Size is too large, must be strictly less than 512 KB. |
| return false; |
| } |
| |
| uint64 a1 = 1; |
| uint64 a2 = 1; |
| uint64 b1 = 0; |
| uint64 b2 = 0; |
| |
| double a = 2.0 * static_cast<double>(srcmem64[0]); |
| double b = 5.0 * static_cast<double>(srcmem64[0]); |
| double c = 7.0 * static_cast<double>(srcmem64[0]); |
| double d = 9.0 * static_cast<double>(srcmem64[0]); |
| |
| unsigned int i = 0; |
| while (i < count) { |
| // Process 64 bits at a time. |
| data.l64 = srcmem64[i]; |
| a1 = a1 + data.l32.l; |
| b1 = b1 + a1; |
| a1 = a1 + data.l32.h; |
| b1 = b1 + a1; |
| dstmem64[i] = data.l64; |
| i++; |
| |
| // Warm cpu up. |
| a = a * b; |
| b = b + c; |
| |
| data.l64 = srcmem64[i]; |
| a2 = a2 + data.l32.l; |
| b2 = b2 + a2; |
| a2 = a2 + data.l32.h; |
| b2 = b2 + a2; |
| dstmem64[i] = data.l64; |
| i++; |
| |
| // Warm cpu up. |
| c = c * d; |
| d = d + d; |
| } |
| |
| // Warm cpu up. |
| d = a + b + c + d; |
| if (d == 1.0) { |
| // Reference the result so that it can't be discarded by the compiler. |
| printf("Log: This will probably never happen.\n"); |
| } |
| |
| checksum->Set(a1, a2, b1, b2); |
| return true; |
| } |
| |
| // x86_64 SSE2 assembly implementation of fast and stressful Adler memory copy. |
| bool AdlerMemcpyAsm(uint64 *dstmem64, uint64 *srcmem64, |
| unsigned int size_in_bytes, AdlerChecksum *checksum) { |
| // Use assembly implementation where supported. |
| #if defined(STRESSAPPTEST_CPU_X86_64) || defined(STRESSAPPTEST_CPU_I686) |
| |
| // Pull a bit of tricky preprocessing to make the inline asm both |
| // 32 bit and 64 bit. |
| #ifdef STRESSAPPTEST_CPU_I686 // Instead of coding both, x86... |
| #define rAX "%%eax" |
| #define rCX "%%ecx" |
| #define rDX "%%edx" |
| #define rBX "%%ebx" |
| #define rSP "%%esp" |
| #define rBP "%%ebp" |
| #define rSI "%%esi" |
| #define rDI "%%edi" |
| #endif |
| |
| #ifdef STRESSAPPTEST_CPU_X86_64 // ...and x64, we use rXX macros. |
| #define rAX "%%rax" |
| #define rCX "%%rcx" |
| #define rDX "%%rdx" |
| #define rBX "%%rbx" |
| #define rSP "%%rsp" |
| #define rBP "%%rbp" |
| #define rSI "%%rsi" |
| #define rDI "%%rdi" |
| #endif |
| |
| // Elements 0 to 3 are used for holding checksum terms a1, a2, |
| // b1, b2 respectively. These elements are filled by asm code. |
| // Elements 4 and 5 are used by asm code to for ANDing MMX data and removing |
| // 2 words from each MMX register (A MMX reg has 4 words, by ANDing we are |
| // setting word index 0 and word index 2 to zero). |
| // Element 6 and 7 are used for setting a1 and a2 to 1. |
| volatile uint64 checksum_arr[] __attribute__ ((aligned(16))) = |
| {0, 0, 0, 0, 0x00000000ffffffffUL, 0x00000000ffffffffUL, 1, 1}; |
| |
| if ((size_in_bytes >> 19) > 0) { |
| // Size is too large. Must be less than 2^19 bytes = 512 KB. |
| return false; |
| } |
| |
| // Number of 32-bit words which are not added to a1/a2 in the main loop. |
| uint32 remaining_words = (size_in_bytes % 48) / 4; |
| |
| // Since we are moving 48 bytes at a time number of iterations = total size/48 |
| // is value of counter. |
| uint32 num_of_48_byte_units = size_in_bytes / 48; |
| |
| asm volatile ( |
| // Source address is in ESI (extended source index) |
| // destination is in EDI (extended destination index) |
| // and counter is already in ECX (extended counter |
| // index). |
| "cmp $0, " rCX ";" // Compare counter to zero. |
| "jz END;" |
| |
| // XMM6 is initialized with 1 and XMM7 with 0. |
| "prefetchnta 0(" rSI ");" |
| "prefetchnta 64(" rSI ");" |
| "movdqu 48(" rAX "), %%xmm6;" |
| "xorps %%xmm7, %%xmm7;" |
| |
| // Start of the loop which copies 48 bytes from source to dst each time. |
| "TOP:\n" |
| |
| // Make 6 moves each of 16 bytes from srcmem to XMM registers. |
| // We are using 2 words out of 4 words in each XMM register, |
| // word index 0 and word index 2 |
| "movdqa 0(" rSI "), %%xmm0;" |
| "movdqu 4(" rSI "), %%xmm1;" // Be careful to use unaligned move here. |
| "movdqa 16(" rSI "), %%xmm2;" |
| "movdqu 20(" rSI "), %%xmm3;" |
| "movdqa 32(" rSI "), %%xmm4;" |
| "movdqu 36(" rSI "), %%xmm5;" |
| |
| // Move 3 * 16 bytes from XMM registers to dstmem. |
| // Note: this copy must be performed before pinsrw instructions since |
| // they will modify the XMM registers. |
| "movntdq %%xmm0, 0(" rDI ");" |
| "movntdq %%xmm2, 16(" rDI ");" |
| "movntdq %%xmm4, 32(" rDI ");" |
| |
| // Sets the word[1] and word[3] of XMM0 to XMM5 to zero. |
| "andps 32(" rAX "), %%xmm0;" |
| "andps 32(" rAX "), %%xmm1;" |
| "andps 32(" rAX "), %%xmm2;" |
| "andps 32(" rAX "), %%xmm3;" |
| "andps 32(" rAX "), %%xmm4;" |
| "andps 32(" rAX "), %%xmm5;" |
| |
| // Add XMM0 to XMM6 and then add XMM6 to XMM7. |
| // Repeat this for XMM1, ..., XMM5. |
| // Overflow(for XMM7) can occur only if there are more |
| // than 2^16 additions => more than 2^17 words => more than 2^19 bytes so |
| // if size_in_bytes > 2^19 than overflow occurs. |
| "paddq %%xmm0, %%xmm6;" |
| "paddq %%xmm6, %%xmm7;" |
| "paddq %%xmm1, %%xmm6;" |
| "paddq %%xmm6, %%xmm7;" |
| "paddq %%xmm2, %%xmm6;" |
| "paddq %%xmm6, %%xmm7;" |
| "paddq %%xmm3, %%xmm6;" |
| "paddq %%xmm6, %%xmm7;" |
| "paddq %%xmm4, %%xmm6;" |
| "paddq %%xmm6, %%xmm7;" |
| "paddq %%xmm5, %%xmm6;" |
| "paddq %%xmm6, %%xmm7;" |
| |
| // Increment ESI and EDI by 48 bytes and decrement counter by 1. |
| "add $48, " rSI ";" |
| "add $48, " rDI ";" |
| "prefetchnta 0(" rSI ");" |
| "prefetchnta 64(" rSI ");" |
| "dec " rCX ";" |
| "jnz TOP;" |
| |
| // Now only remaining_words 32-bit words are left. |
| // make a loop, add first two words to a1 and next two to a2 (just like |
| // above loop, the only extra thing we are doing is rechecking |
| // rDX (=remaining_words) everytime we add a number to a1/a2. |
| "REM_IS_STILL_NOT_ZERO:\n" |
| // Unless remaining_words becomes less than 4 words(16 bytes) |
| // there is not much issue and remaining_words will always |
| // be a multiple of four by assumption. |
| "cmp $4, " rDX ";" |
| // In case for some weird reasons if remaining_words becomes |
| // less than 4 but not zero then also break the code and go off to END. |
| "jl END;" |
| // Otherwise just go on and copy data in chunks of 4-words at a time till |
| // whole data (<48 bytes) is copied. |
| "movdqa 0(" rSI "), %%xmm0;" // Copy next 4-words to XMM0 and to XMM1. |
| |
| "movdqa 0(" rSI "), %%xmm5;" // Accomplish movdqu 4(%rSI) without |
| "pshufd $0x39, %%xmm5, %%xmm1;" // indexing off memory boundary. |
| |
| "movntdq %%xmm0, 0(" rDI ");" // Copy 4-words to destination. |
| "andps 32(" rAX "), %%xmm0;" |
| "andps 32(" rAX "), %%xmm1;" |
| "paddq %%xmm0, %%xmm6;" |
| "paddq %%xmm6, %%xmm7;" |
| "paddq %%xmm1, %%xmm6;" |
| "paddq %%xmm6, %%xmm7;" |
| "add $16, " rSI ";" |
| "add $16, " rDI ";" |
| "sub $4, " rDX ";" |
| // Decrement %rDX by 4 since %rDX is number of 32-bit |
| // words left after considering all 48-byte units. |
| "jmp REM_IS_STILL_NOT_ZERO;" |
| |
| "END:\n" |
| // Report checksum values A and B (both right now are two concatenated |
| // 64 bit numbers and have to be converted to 64 bit numbers) |
| // seems like Adler128 (since size of each part is 4 byte rather than |
| // 1 byte). |
| "movdqa %%xmm6, 0(" rAX ");" |
| "movdqa %%xmm7, 16(" rAX ");" |
| "sfence;" |
| |
| // No output registers. |
| : |
| // Input registers. |
| : "S" (srcmem64), "D" (dstmem64), "a" (checksum_arr), |
| "c" (num_of_48_byte_units), "d" (remaining_words) |
| ); // asm. |
| |
| if (checksum != NULL) { |
| checksum->Set(checksum_arr[0], checksum_arr[1], |
| checksum_arr[2], checksum_arr[3]); |
| } |
| |
| // Everything went fine, so return true (this does not mean |
| // that there is no problem with memory this just mean that data was copied |
| // from src to dst and checksum was calculated successfully). |
| return true; |
| #elif defined(STRESSAPPTEST_CPU_ARMV7A) && defined(__ARM_NEON__) |
| // Elements 0 to 3 are used for holding checksum terms a1, a2, |
| // b1, b2 respectively. These elements are filled by asm code. |
| // Checksum is seeded with the null checksum. |
| volatile uint64 checksum_arr[] __attribute__ ((aligned(16))) = |
| {1, 1, 0, 0}; |
| |
| if ((size_in_bytes >> 19) > 0) { |
| // Size is too large. Must be less than 2^19 bytes = 512 KB. |
| return false; |
| } |
| |
| // Since we are moving 64 bytes at a time number of iterations = total size/64 |
| uint32 blocks = size_in_bytes / 64; |
| |
| uint64 *dst = dstmem64; |
| uint64 *src = srcmem64; |
| |
| #define src_r "r3" |
| #define dst_r "r4" |
| #define blocks_r "r5" |
| #define crc_r "r6" |
| |
| asm volatile ( |
| "mov " src_r ", %[src]; \n" |
| "mov " dst_r ", %[dst]; \n" |
| "mov " crc_r ", %[crc]; \n" |
| "mov " blocks_r ", %[blocks]; \n" |
| |
| // Loop over block count. |
| "cmp " blocks_r ", #0; \n" // Compare counter to zero. |
| "ble END; \n" |
| |
| |
| // Preload upcoming cacheline. |
| "pld [" src_r ", #0x0]; \n" |
| "pld [" src_r ", #0x20]; \n" |
| |
| // Init checksum |
| "vldm " crc_r ", {q0}; \n" |
| "vmov.i32 q1, #0; \n" |
| |
| // Start of the loop which copies 48 bytes from source to dst each time. |
| "TOP: \n" |
| |
| // Make 3 moves each of 16 bytes from srcmem to qX registers. |
| // We are using 2 words out of 4 words in each qX register, |
| // word index 0 and word index 2. We'll swizzle them in a bit. |
| // Copy it. |
| "vldm " src_r "!, {q8, q9, q10, q11}; \n" |
| "vstm " dst_r "!, {q8, q9, q10, q11}; \n" |
| |
| // Arrange it. |
| "vmov.i64 q12, #0; \n" |
| "vmov.i64 q13, #0; \n" |
| "vmov.i64 q14, #0; \n" |
| "vmov.i64 q15, #0; \n" |
| // This exchenges words 1,3 in the filled registers with |
| // words 0,2 in the empty registers. |
| "vtrn.32 q8, q12; \n" |
| "vtrn.32 q9, q13; \n" |
| "vtrn.32 q10, q14; \n" |
| "vtrn.32 q11, q15; \n" |
| |
| // Sum into q0, then into q1. |
| // Repeat this for q8 - q13. |
| // Overflow can occur only if there are more |
| // than 2^16 additions => more than 2^17 words => more than 2^19 bytes so |
| // if size_in_bytes > 2^19 than overflow occurs. |
| "vadd.i64 q0, q0, q8; \n" |
| "vadd.i64 q1, q1, q0; \n" |
| "vadd.i64 q0, q0, q12; \n" |
| "vadd.i64 q1, q1, q0; \n" |
| "vadd.i64 q0, q0, q9; \n" |
| "vadd.i64 q1, q1, q0; \n" |
| "vadd.i64 q0, q0, q13; \n" |
| "vadd.i64 q1, q1, q0; \n" |
| |
| "vadd.i64 q0, q0, q10; \n" |
| "vadd.i64 q1, q1, q0; \n" |
| "vadd.i64 q0, q0, q14; \n" |
| "vadd.i64 q1, q1, q0; \n" |
| "vadd.i64 q0, q0, q11; \n" |
| "vadd.i64 q1, q1, q0; \n" |
| "vadd.i64 q0, q0, q15; \n" |
| "vadd.i64 q1, q1, q0; \n" |
| |
| // Increment counter and loop. |
| "sub " blocks_r ", " blocks_r ", #1; \n" |
| "cmp " blocks_r ", #0; \n" // Compare counter to zero. |
| "bgt TOP; \n" |
| |
| |
| "END:\n" |
| // Report checksum values A and B (both right now are two concatenated |
| // 64 bit numbers and have to be converted to 64 bit numbers) |
| // seems like Adler128 (since size of each part is 4 byte rather than |
| // 1 byte). |
| "vstm " crc_r ", {q0, q1}; \n" |
| |
| // Output registers. |
| : |
| // Input registers. |
| : [src] "r"(src), [dst] "r"(dst), [blocks] "r"(blocks) , [crc] "r"(checksum_arr) |
| : "memory", "cc", "r3", "r4", "r5", "r6", "q0", "q1", "q8","q9","q10", "q11", "q12","q13","q14","q15" |
| ); // asm. |
| |
| if (checksum != NULL) { |
| checksum->Set(checksum_arr[0], checksum_arr[1], |
| checksum_arr[2], checksum_arr[3]); |
| } |
| |
| // Everything went fine, so return true (this does not mean |
| // that there is no problem with memory this just mean that data was copied |
| // from src to dst and checksum was calculated successfully). |
| return true; |
| #else |
| #warning "No vector copy defined for this architecture." |
| // Fall back to C implementation for anything else. |
| return AdlerMemcpyWarmC(dstmem64, srcmem64, size_in_bytes, checksum); |
| #endif |
| } |