| // This code is unused. PCMPESTRI is gratuitously slow. I imagine it might |
| // start winning with a hypothetical memchr4 (or greater). This technique might |
| // also be good for exposing searches over ranges of bytes, but that departs |
| // from the standard memchr API, so it's not clear whether we actually want |
| // that or not. |
| // |
| // N.B. PCMPISTRI appears to be about twice as fast as PCMPESTRI, which is kind |
| // of neat. Unfortunately, UTF-8 strings can contain NUL bytes, which means |
| // I don't see a way of effectively using PCMPISTRI unless there's some fast |
| // way to replace zero bytes with a byte that is not not a needle byte. |
| |
| use core::arch::x86_64::*; |
| use core::mem::size_of; |
| |
| use x86::sse2; |
| |
| const VECTOR_SIZE: usize = size_of::<__m128i>(); |
| const CONTROL_ANY: i32 = |
| _SIDD_UBYTE_OPS |
| | _SIDD_CMP_EQUAL_ANY |
| | _SIDD_POSITIVE_POLARITY |
| | _SIDD_LEAST_SIGNIFICANT; |
| |
| #[target_feature(enable = "sse4.2")] |
| pub unsafe fn memchr3( |
| n1: u8, n2: u8, n3: u8, |
| haystack: &[u8] |
| ) -> Option<usize> { |
| let vn1 = _mm_set1_epi8(n1 as i8); |
| let vn2 = _mm_set1_epi8(n2 as i8); |
| let vn3 = _mm_set1_epi8(n3 as i8); |
| let vn = _mm_setr_epi8( |
| n1 as i8, n2 as i8, n3 as i8, 0, |
| 0, 0, 0, 0, |
| 0, 0, 0, 0, |
| 0, 0, 0, 0, |
| ); |
| let len = haystack.len(); |
| let start_ptr = haystack.as_ptr(); |
| let end_ptr = haystack[haystack.len()..].as_ptr(); |
| let mut ptr = start_ptr; |
| |
| if haystack.len() < VECTOR_SIZE { |
| while ptr < end_ptr { |
| if *ptr == n1 || *ptr == n2 || *ptr == n3 { |
| return Some(sub(ptr, start_ptr)); |
| } |
| ptr = ptr.offset(1); |
| } |
| return None; |
| } |
| while ptr <= end_ptr.sub(VECTOR_SIZE) { |
| let chunk = _mm_loadu_si128(ptr as *const __m128i); |
| let res = _mm_cmpestri(vn, 3, chunk, 16, CONTROL_ANY); |
| if res < 16 { |
| return Some(sub(ptr, start_ptr) + res as usize); |
| } |
| ptr = ptr.add(VECTOR_SIZE); |
| } |
| if ptr < end_ptr { |
| debug_assert!(sub(end_ptr, ptr) < VECTOR_SIZE); |
| ptr = ptr.sub(VECTOR_SIZE - sub(end_ptr, ptr)); |
| debug_assert_eq!(sub(end_ptr, ptr), VECTOR_SIZE); |
| |
| return sse2::forward_search3(start_ptr, end_ptr, ptr, vn1, vn2, vn3); |
| } |
| None |
| } |
| |
| /// Subtract `b` from `a` and return the difference. `a` should be greater than |
| /// or equal to `b`. |
| fn sub(a: *const u8, b: *const u8) -> usize { |
| debug_assert!(a >= b); |
| (a as usize) - (b as usize) |
| } |