1 /* Byte-wise substring search, using the Two-Way algorithm.
2  * Copyright (C) 2008, 2010 Eric Blake
3  * Permission to use, copy, modify, and distribute this software
4  * is freely granted, provided that this notice is preserved.
5  */
6 
7 
8 /* Before including this file, you need to include <string.h>, and define:
9      RESULT_TYPE		A macro that expands to the return type.
10      AVAILABLE(h, h_l, j, n_l)	A macro that returns nonzero if there are
11 				at least N_L bytes left starting at
12 				H[J].  H is 'unsigned char *', H_L, J,
13 				and N_L are 'size_t'; H_L is an
14 				lvalue.  For NUL-terminated searches,
15 				H_L can be modified each iteration to
16 				avoid having to compute the end of H
17 				up front.
18 
19   For case-insensitivity, you may optionally define:
20      CMP_FUNC(p1, p2, l)	A macro that returns 0 iff the first L
21 				characters of P1 and P2 are equal.
22      CANON_ELEMENT(c)		A macro that canonicalizes an element
23 				right after it has been fetched from
24 				one of the two strings.  The argument
25 				is an 'unsigned char'; the result must
26 				be an 'unsigned char' as well.
27 
28   This file undefines the macros documented above, and defines
29   LONG_NEEDLE_THRESHOLD.
30 */
31 
32 #include <limits.h>
33 #include <stdint.h>
34 
35 extern void * _memchr(const void * src_void , int c , size_t length);
36 extern int _memcmp(const void * m1 , const void * m2 , size_t n);
37 
38 /* We use the Two-Way string matching algorithm, which guarantees
39    linear complexity with constant space.  Additionally, for long
40    needles, we also use a bad character shift table similar to the
41    Boyer-Moore algorithm to achieve improved (potentially sub-linear)
42    performance.
43 
44    See http://www-igm.univ-mlv.fr/~lecroq/string/node26.html#SECTION00260
45    and http://en.wikipedia.org/wiki/Boyer-Moore_string_search_algorithm
46 */
47 
48 /* Point at which computing a bad-byte shift table is likely to be
49    worthwhile.  Small needles should not compute a table, since it
50    adds (1 << CHAR_BIT) + NEEDLE_LEN computations of preparation for a
51    speedup no greater than a factor of NEEDLE_LEN.  The larger the
52    needle, the better the potential performance gain.  On the other
53    hand, on non-POSIX systems with CHAR_BIT larger than eight, the
54    memory required for the table is prohibitive.  */
55 #if CHAR_BIT < 10
56 # define LONG_NEEDLE_THRESHOLD 32U
57 #else
58 # define LONG_NEEDLE_THRESHOLD SIZE_MAX
59 #endif
60 
61 #define MAX(a, b) ((a < b) ? (b) : (a))
62 
63 #ifndef CANON_ELEMENT
64 # define CANON_ELEMENT(c) c
65 #endif
66 #ifndef CMP_FUNC
67 # define CMP_FUNC _memcmp
68 #endif
69 
70 /* Perform a critical factorization of NEEDLE, of length NEEDLE_LEN.
71    Return the index of the first byte in the right half, and set
72    *PERIOD to the global period of the right half.
73 
74    The global period of a string is the smallest index (possibly its
75    length) at which all remaining bytes in the string are repetitions
76    of the prefix (the last repetition may be a subset of the prefix).
77 
78    When NEEDLE is factored into two halves, a local period is the
79    length of the smallest word that shares a suffix with the left half
80    and shares a prefix with the right half.  All factorizations of a
81    non-empty NEEDLE have a local period of at least 1 and no greater
82    than NEEDLE_LEN.
83 
84    A critical factorization has the property that the local period
85    equals the global period.  All strings have at least one critical
86    factorization with the left half smaller than the global period.
87 
88    Given an ordered alphabet, a critical factorization can be computed
89    in linear time, with 2 * NEEDLE_LEN comparisons, by computing the
90    larger of two ordered maximal suffixes.  The ordered maximal
91    suffixes are determined by lexicographic comparison of
92    periodicity.  */
93 LIBC_ROM_TEXT_SECTION
94 _LONG_CALL_
95 static size_t
__rtl_critical_factorization(const unsigned char * needle,size_t needle_len,size_t * period)96 __rtl_critical_factorization (const unsigned char *needle, size_t needle_len,
97 			size_t *period)
98 {
99   /* Index of last byte of left half, or SIZE_MAX.  */
100   size_t max_suffix, max_suffix_rev;
101   size_t j; /* Index into NEEDLE for current candidate suffix.  */
102   size_t k; /* Offset into current period.  */
103   size_t p; /* Intermediate period.  */
104   unsigned char a, b; /* Current comparison bytes.  */
105 
106   /* Invariants:
107      0 <= j < NEEDLE_LEN - 1
108      -1 <= max_suffix{,_rev} < j (treating SIZE_MAX as if it were signed)
109      min(max_suffix, max_suffix_rev) < global period of NEEDLE
110      1 <= p <= global period of NEEDLE
111      p == global period of the substring NEEDLE[max_suffix{,_rev}+1...j]
112      1 <= k <= p
113   */
114 
115   /* Perform lexicographic search.  */
116   max_suffix = SIZE_MAX;
117   j = 0;
118   k = p = 1;
119   while (j + k < needle_len)
120     {
121       a = CANON_ELEMENT (needle[j + k]);
122       b = CANON_ELEMENT (needle[(size_t)(max_suffix + k)]);
123       if (a < b)
124 	{
125 	  /* Suffix is smaller, period is entire prefix so far.  */
126 	  j += k;
127 	  k = 1;
128 	  p = j - max_suffix;
129 	}
130       else if (a == b)
131 	{
132 	  /* Advance through repetition of the current period.  */
133 	  if (k != p)
134 	    ++k;
135 	  else
136 	    {
137 	      j += p;
138 	      k = 1;
139 	    }
140 	}
141       else /* b < a */
142 	{
143 	  /* Suffix is larger, start over from current location.  */
144 	  max_suffix = j++;
145 	  k = p = 1;
146 	}
147     }
148   *period = p;
149 
150   /* Perform reverse lexicographic search.  */
151   max_suffix_rev = SIZE_MAX;
152   j = 0;
153   k = p = 1;
154   while (j + k < needle_len)
155     {
156       a = CANON_ELEMENT (needle[j + k]);
157       b = CANON_ELEMENT (needle[max_suffix_rev + k]);
158       if (b < a)
159 	{
160 	  /* Suffix is smaller, period is entire prefix so far.  */
161 	  j += k;
162 	  k = 1;
163 	  p = j - max_suffix_rev;
164 	}
165       else if (a == b)
166 	{
167 	  /* Advance through repetition of the current period.  */
168 	  if (k != p)
169 	    ++k;
170 	  else
171 	    {
172 	      j += p;
173 	      k = 1;
174 	    }
175 	}
176       else /* a < b */
177 	{
178 	  /* Suffix is larger, start over from current location.  */
179 	  max_suffix_rev = j++;
180 	  k = p = 1;
181 	}
182     }
183 
184   /* Choose the longer suffix.  Return the first byte of the right
185      half, rather than the last byte of the left half.  */
186   if (max_suffix_rev + 1 < max_suffix + 1)
187     return max_suffix + 1;
188   *period = p;
189   return max_suffix_rev + 1;
190 }
191 
192 /* Return the first location of non-empty NEEDLE within HAYSTACK, or
193    NULL.  HAYSTACK_LEN is the minimum known length of HAYSTACK.  This
194    method is optimized for NEEDLE_LEN < LONG_NEEDLE_THRESHOLD.
195    Performance is guaranteed to be linear, with an initialization cost
196    of 2 * NEEDLE_LEN comparisons.
197 
198    If AVAILABLE does not modify HAYSTACK_LEN (as in memmem), then at
199    most 2 * HAYSTACK_LEN - NEEDLE_LEN comparisons occur in searching.
200    If AVAILABLE modifies HAYSTACK_LEN (as in strstr), then at most 3 *
201    HAYSTACK_LEN - NEEDLE_LEN comparisons occur in searching.  */
202 LIBC_ROM_TEXT_SECTION
203 _LONG_CALL_
204 static RETURN_TYPE
__rtl_two_way_short_needle(const unsigned char * haystack,size_t haystack_len,const unsigned char * needle,size_t needle_len)205 __rtl_two_way_short_needle (const unsigned char *haystack, size_t haystack_len,
206 		      const unsigned char *needle, size_t needle_len)
207 {
208   size_t i; /* Index into current byte of NEEDLE.  */
209   size_t j; /* Index into current window of HAYSTACK.  */
210   size_t period; /* The period of the right half of needle.  */
211   size_t suffix; /* The index of the right half of needle.  */
212 
213   /* Factor the needle into two halves, such that the left half is
214      smaller than the global period, and the right half is
215      periodic (with a period as large as NEEDLE_LEN - suffix).  */
216   suffix = __rtl_critical_factorization (needle, needle_len, &period);
217 
218   /* Perform the search.  Each iteration compares the right half
219      first.  */
220   if (CMP_FUNC (needle, needle + period, suffix) == 0)
221     {
222       /* Entire needle is periodic; a mismatch can only advance by the
223 	 period, so use memory to avoid rescanning known occurrences
224 	 of the period.  */
225       size_t memory = 0;
226       j = 0;
227       while (AVAILABLE (haystack, haystack_len, j, needle_len))
228 	{
229 	  /* Scan for matches in right half.  */
230 	  i = MAX (suffix, memory);
231 	  while (i < needle_len && (CANON_ELEMENT (needle[i])
232 				    == CANON_ELEMENT (haystack[i + j])))
233 	    ++i;
234 	  if (needle_len <= i)
235 	    {
236 	      /* Scan for matches in left half.  */
237 	      i = suffix - 1;
238 	      while (memory < i + 1 && (CANON_ELEMENT (needle[i])
239 					== CANON_ELEMENT (haystack[i + j])))
240 		--i;
241 	      if (i + 1 < memory + 1)
242 		return (RETURN_TYPE) (haystack + j);
243 	      /* No match, so remember how many repetitions of period
244 		 on the right half were scanned.  */
245 	      j += period;
246 	      memory = needle_len - period;
247 	    }
248 	  else
249 	    {
250 	      j += i - suffix + 1;
251 	      memory = 0;
252 	    }
253 	}
254     }
255   else
256     {
257       /* The two halves of needle are distinct; no extra memory is
258 	 required, and any mismatch results in a maximal shift.  */
259       period = MAX (suffix, needle_len - suffix) + 1;
260       j = 0;
261       while (AVAILABLE (haystack, haystack_len, j, needle_len))
262 	{
263 	  /* Scan for matches in right half.  */
264 	  i = suffix;
265 	  while (i < needle_len && (CANON_ELEMENT (needle[i])
266 				    == CANON_ELEMENT (haystack[i + j])))
267 	    ++i;
268 	  if (needle_len <= i)
269 	    {
270 	      /* Scan for matches in left half.  */
271 	      i = suffix - 1;
272 	      while (i != SIZE_MAX && (CANON_ELEMENT (needle[i])
273 				       == CANON_ELEMENT (haystack[i + j])))
274 		--i;
275 	      if (i == SIZE_MAX)
276 		return (RETURN_TYPE) (haystack + j);
277 	      j += period;
278 	    }
279 	  else
280 	    j += i - suffix + 1;
281 	}
282     }
283   return NULL;
284 }
285 
286 /* Return the first location of non-empty NEEDLE within HAYSTACK, or
287    NULL.  HAYSTACK_LEN is the minimum known length of HAYSTACK.  This
288    method is optimized for LONG_NEEDLE_THRESHOLD <= NEEDLE_LEN.
289    Performance is guaranteed to be linear, with an initialization cost
290    of 3 * NEEDLE_LEN + (1 << CHAR_BIT) operations.
291 
292    If AVAILABLE does not modify HAYSTACK_LEN (as in memmem), then at
293    most 2 * HAYSTACK_LEN - NEEDLE_LEN comparisons occur in searching,
294    and sublinear performance O(HAYSTACK_LEN / NEEDLE_LEN) is possible.
295    If AVAILABLE modifies HAYSTACK_LEN (as in strstr), then at most 3 *
296    HAYSTACK_LEN - NEEDLE_LEN comparisons occur in searching, and
297    sublinear performance is not possible.  */
298 LIBC_ROM_TEXT_SECTION
299 _LONG_CALL_
300 static RETURN_TYPE
__rtl_two_way_long_needle(const unsigned char * haystack,size_t haystack_len,const unsigned char * needle,size_t needle_len)301 __rtl_two_way_long_needle (const unsigned char *haystack, size_t haystack_len,
302 		     const unsigned char *needle, size_t needle_len)
303 {
304   size_t i; /* Index into current byte of NEEDLE.  */
305   size_t j; /* Index into current window of HAYSTACK.  */
306   size_t period; /* The period of the right half of needle.  */
307   size_t suffix; /* The index of the right half of needle.  */
308   size_t shift_table[1U << CHAR_BIT]; /* See below.  */
309 
310   /* Factor the needle into two halves, such that the left half is
311      smaller than the global period, and the right half is
312      periodic (with a period as large as NEEDLE_LEN - suffix).  */
313   suffix = __rtl_critical_factorization (needle, needle_len, &period);
314 
315   /* Populate shift_table.  For each possible byte value c,
316      shift_table[c] is the distance from the last occurrence of c to
317      the end of NEEDLE, or NEEDLE_LEN if c is absent from the NEEDLE.
318      shift_table[NEEDLE[NEEDLE_LEN - 1]] contains the only 0.  */
319   for (i = 0; i < 1U << CHAR_BIT; i++)
320     shift_table[i] = needle_len;
321   for (i = 0; i < needle_len; i++)
322     shift_table[CANON_ELEMENT (needle[i])] = needle_len - i - 1;
323 
324   /* Perform the search.  Each iteration compares the right half
325      first.  */
326   if (CMP_FUNC (needle, needle + period, suffix) == 0)
327     {
328       /* Entire needle is periodic; a mismatch can only advance by the
329 	 period, so use memory to avoid rescanning known occurrences
330 	 of the period.  */
331       size_t memory = 0;
332       size_t shift;
333       j = 0;
334       while (AVAILABLE (haystack, haystack_len, j, needle_len))
335 	{
336 	  /* Check the last byte first; if it does not match, then
337 	     shift to the next possible match location.  */
338 	  shift = shift_table[CANON_ELEMENT (haystack[j + needle_len - 1])];
339 	  if (0 < shift)
340 	    {
341 	      if (memory && shift < period)
342 		{
343 		  /* Since needle is periodic, but the last period has
344 		     a byte out of place, there can be no match until
345 		     after the mismatch.  */
346 		  shift = needle_len - period;
347 		}
348 	      memory = 0;
349 	      j += shift;
350 	      continue;
351 	    }
352 	  /* Scan for matches in right half.  The last byte has
353 	     already been matched, by virtue of the shift table.  */
354 	  i = MAX (suffix, memory);
355 	  while (i < needle_len - 1 && (CANON_ELEMENT (needle[i])
356 					== CANON_ELEMENT (haystack[i + j])))
357 	    ++i;
358 	  if (needle_len - 1 <= i)
359 	    {
360 	      /* Scan for matches in left half.  */
361 	      i = suffix - 1;
362 	      while (memory < i + 1 && (CANON_ELEMENT (needle[i])
363 					== CANON_ELEMENT (haystack[i + j])))
364 		--i;
365 	      if (i + 1 < memory + 1)
366 		return (RETURN_TYPE) (haystack + j);
367 	      /* No match, so remember how many repetitions of period
368 		 on the right half were scanned.  */
369 	      j += period;
370 	      memory = needle_len - period;
371 	    }
372 	  else
373 	    {
374 	      j += i - suffix + 1;
375 	      memory = 0;
376 	    }
377 	}
378     }
379   else
380     {
381       /* The two halves of needle are distinct; no extra memory is
382 	 required, and any mismatch results in a maximal shift.  */
383       size_t shift;
384       period = MAX (suffix, needle_len - suffix) + 1;
385       j = 0;
386       while (AVAILABLE (haystack, haystack_len, j, needle_len))
387 	{
388 	  /* Check the last byte first; if it does not match, then
389 	     shift to the next possible match location.  */
390 	  shift = shift_table[CANON_ELEMENT (haystack[j + needle_len - 1])];
391 	  if (0 < shift)
392 	    {
393 	      j += shift;
394 	      continue;
395 	    }
396 	  /* Scan for matches in right half.  The last byte has
397 	     already been matched, by virtue of the shift table.  */
398 	  i = suffix;
399 	  while (i < needle_len - 1 && (CANON_ELEMENT (needle[i])
400 					== CANON_ELEMENT (haystack[i + j])))
401 	    ++i;
402 	  if (needle_len - 1 <= i)
403 	    {
404 	      /* Scan for matches in left half.  */
405 	      i = suffix - 1;
406 	      while (i != SIZE_MAX && (CANON_ELEMENT (needle[i])
407 				       == CANON_ELEMENT (haystack[i + j])))
408 		--i;
409 	      if (i == SIZE_MAX)
410 		return (RETURN_TYPE) (haystack + j);
411 	      j += period;
412 	    }
413 	  else
414 	    j += i - suffix + 1;
415 	}
416     }
417   return NULL;
418 }
419 
420 #undef AVAILABLE
421 #undef CANON_ELEMENT
422 #undef CMP_FUNC
423 #undef MAX
424 #undef RETURN_TYPE
425