1 /* Byte-wise substring search, using the Two-Way algorithm.
2 * Copyright (C) 2008, 2010 Eric Blake
3 * Permission to use, copy, modify, and distribute this software
4 * is freely granted, provided that this notice is preserved.
5 */
6
7
8 /* Before including this file, you need to include <string.h>, and define:
9 RESULT_TYPE A macro that expands to the return type.
10 AVAILABLE(h, h_l, j, n_l) A macro that returns nonzero if there are
11 at least N_L bytes left starting at
12 H[J]. H is 'unsigned char *', H_L, J,
13 and N_L are 'size_t'; H_L is an
14 lvalue. For NUL-terminated searches,
15 H_L can be modified each iteration to
16 avoid having to compute the end of H
17 up front.
18
19 For case-insensitivity, you may optionally define:
20 CMP_FUNC(p1, p2, l) A macro that returns 0 iff the first L
21 characters of P1 and P2 are equal.
22 CANON_ELEMENT(c) A macro that canonicalizes an element
23 right after it has been fetched from
24 one of the two strings. The argument
25 is an 'unsigned char'; the result must
26 be an 'unsigned char' as well.
27
28 This file undefines the macros documented above, and defines
29 LONG_NEEDLE_THRESHOLD.
30 */
31
32 #include <limits.h>
33 #include <stdint.h>
34
35 extern void * _memchr(const void * src_void , int c , size_t length);
36 extern int _memcmp(const void * m1 , const void * m2 , size_t n);
37
38 /* We use the Two-Way string matching algorithm, which guarantees
39 linear complexity with constant space. Additionally, for long
40 needles, we also use a bad character shift table similar to the
41 Boyer-Moore algorithm to achieve improved (potentially sub-linear)
42 performance.
43
44 See http://www-igm.univ-mlv.fr/~lecroq/string/node26.html#SECTION00260
45 and http://en.wikipedia.org/wiki/Boyer-Moore_string_search_algorithm
46 */
47
48 /* Point at which computing a bad-byte shift table is likely to be
49 worthwhile. Small needles should not compute a table, since it
50 adds (1 << CHAR_BIT) + NEEDLE_LEN computations of preparation for a
51 speedup no greater than a factor of NEEDLE_LEN. The larger the
52 needle, the better the potential performance gain. On the other
53 hand, on non-POSIX systems with CHAR_BIT larger than eight, the
54 memory required for the table is prohibitive. */
55 #if CHAR_BIT < 10
56 # define LONG_NEEDLE_THRESHOLD 32U
57 #else
58 # define LONG_NEEDLE_THRESHOLD SIZE_MAX
59 #endif
60
61 #define MAX(a, b) ((a < b) ? (b) : (a))
62
63 #ifndef CANON_ELEMENT
64 # define CANON_ELEMENT(c) c
65 #endif
66 #ifndef CMP_FUNC
67 # define CMP_FUNC _memcmp
68 #endif
69
70 /* Perform a critical factorization of NEEDLE, of length NEEDLE_LEN.
71 Return the index of the first byte in the right half, and set
72 *PERIOD to the global period of the right half.
73
74 The global period of a string is the smallest index (possibly its
75 length) at which all remaining bytes in the string are repetitions
76 of the prefix (the last repetition may be a subset of the prefix).
77
78 When NEEDLE is factored into two halves, a local period is the
79 length of the smallest word that shares a suffix with the left half
80 and shares a prefix with the right half. All factorizations of a
81 non-empty NEEDLE have a local period of at least 1 and no greater
82 than NEEDLE_LEN.
83
84 A critical factorization has the property that the local period
85 equals the global period. All strings have at least one critical
86 factorization with the left half smaller than the global period.
87
88 Given an ordered alphabet, a critical factorization can be computed
89 in linear time, with 2 * NEEDLE_LEN comparisons, by computing the
90 larger of two ordered maximal suffixes. The ordered maximal
91 suffixes are determined by lexicographic comparison of
92 periodicity. */
93 LIBC_ROM_TEXT_SECTION
94 _LONG_CALL_
95 static size_t
__rtl_critical_factorization(const unsigned char * needle,size_t needle_len,size_t * period)96 __rtl_critical_factorization (const unsigned char *needle, size_t needle_len,
97 size_t *period)
98 {
99 /* Index of last byte of left half, or SIZE_MAX. */
100 size_t max_suffix, max_suffix_rev;
101 size_t j; /* Index into NEEDLE for current candidate suffix. */
102 size_t k; /* Offset into current period. */
103 size_t p; /* Intermediate period. */
104 unsigned char a, b; /* Current comparison bytes. */
105
106 /* Invariants:
107 0 <= j < NEEDLE_LEN - 1
108 -1 <= max_suffix{,_rev} < j (treating SIZE_MAX as if it were signed)
109 min(max_suffix, max_suffix_rev) < global period of NEEDLE
110 1 <= p <= global period of NEEDLE
111 p == global period of the substring NEEDLE[max_suffix{,_rev}+1...j]
112 1 <= k <= p
113 */
114
115 /* Perform lexicographic search. */
116 max_suffix = SIZE_MAX;
117 j = 0;
118 k = p = 1;
119 while (j + k < needle_len)
120 {
121 a = CANON_ELEMENT (needle[j + k]);
122 b = CANON_ELEMENT (needle[(size_t)(max_suffix + k)]);
123 if (a < b)
124 {
125 /* Suffix is smaller, period is entire prefix so far. */
126 j += k;
127 k = 1;
128 p = j - max_suffix;
129 }
130 else if (a == b)
131 {
132 /* Advance through repetition of the current period. */
133 if (k != p)
134 ++k;
135 else
136 {
137 j += p;
138 k = 1;
139 }
140 }
141 else /* b < a */
142 {
143 /* Suffix is larger, start over from current location. */
144 max_suffix = j++;
145 k = p = 1;
146 }
147 }
148 *period = p;
149
150 /* Perform reverse lexicographic search. */
151 max_suffix_rev = SIZE_MAX;
152 j = 0;
153 k = p = 1;
154 while (j + k < needle_len)
155 {
156 a = CANON_ELEMENT (needle[j + k]);
157 b = CANON_ELEMENT (needle[max_suffix_rev + k]);
158 if (b < a)
159 {
160 /* Suffix is smaller, period is entire prefix so far. */
161 j += k;
162 k = 1;
163 p = j - max_suffix_rev;
164 }
165 else if (a == b)
166 {
167 /* Advance through repetition of the current period. */
168 if (k != p)
169 ++k;
170 else
171 {
172 j += p;
173 k = 1;
174 }
175 }
176 else /* a < b */
177 {
178 /* Suffix is larger, start over from current location. */
179 max_suffix_rev = j++;
180 k = p = 1;
181 }
182 }
183
184 /* Choose the longer suffix. Return the first byte of the right
185 half, rather than the last byte of the left half. */
186 if (max_suffix_rev + 1 < max_suffix + 1)
187 return max_suffix + 1;
188 *period = p;
189 return max_suffix_rev + 1;
190 }
191
192 /* Return the first location of non-empty NEEDLE within HAYSTACK, or
193 NULL. HAYSTACK_LEN is the minimum known length of HAYSTACK. This
194 method is optimized for NEEDLE_LEN < LONG_NEEDLE_THRESHOLD.
195 Performance is guaranteed to be linear, with an initialization cost
196 of 2 * NEEDLE_LEN comparisons.
197
198 If AVAILABLE does not modify HAYSTACK_LEN (as in memmem), then at
199 most 2 * HAYSTACK_LEN - NEEDLE_LEN comparisons occur in searching.
200 If AVAILABLE modifies HAYSTACK_LEN (as in strstr), then at most 3 *
201 HAYSTACK_LEN - NEEDLE_LEN comparisons occur in searching. */
202 LIBC_ROM_TEXT_SECTION
203 _LONG_CALL_
204 static RETURN_TYPE
__rtl_two_way_short_needle(const unsigned char * haystack,size_t haystack_len,const unsigned char * needle,size_t needle_len)205 __rtl_two_way_short_needle (const unsigned char *haystack, size_t haystack_len,
206 const unsigned char *needle, size_t needle_len)
207 {
208 size_t i; /* Index into current byte of NEEDLE. */
209 size_t j; /* Index into current window of HAYSTACK. */
210 size_t period; /* The period of the right half of needle. */
211 size_t suffix; /* The index of the right half of needle. */
212
213 /* Factor the needle into two halves, such that the left half is
214 smaller than the global period, and the right half is
215 periodic (with a period as large as NEEDLE_LEN - suffix). */
216 suffix = __rtl_critical_factorization (needle, needle_len, &period);
217
218 /* Perform the search. Each iteration compares the right half
219 first. */
220 if (CMP_FUNC (needle, needle + period, suffix) == 0)
221 {
222 /* Entire needle is periodic; a mismatch can only advance by the
223 period, so use memory to avoid rescanning known occurrences
224 of the period. */
225 size_t memory = 0;
226 j = 0;
227 while (AVAILABLE (haystack, haystack_len, j, needle_len))
228 {
229 /* Scan for matches in right half. */
230 i = MAX (suffix, memory);
231 while (i < needle_len && (CANON_ELEMENT (needle[i])
232 == CANON_ELEMENT (haystack[i + j])))
233 ++i;
234 if (needle_len <= i)
235 {
236 /* Scan for matches in left half. */
237 i = suffix - 1;
238 while (memory < i + 1 && (CANON_ELEMENT (needle[i])
239 == CANON_ELEMENT (haystack[i + j])))
240 --i;
241 if (i + 1 < memory + 1)
242 return (RETURN_TYPE) (haystack + j);
243 /* No match, so remember how many repetitions of period
244 on the right half were scanned. */
245 j += period;
246 memory = needle_len - period;
247 }
248 else
249 {
250 j += i - suffix + 1;
251 memory = 0;
252 }
253 }
254 }
255 else
256 {
257 /* The two halves of needle are distinct; no extra memory is
258 required, and any mismatch results in a maximal shift. */
259 period = MAX (suffix, needle_len - suffix) + 1;
260 j = 0;
261 while (AVAILABLE (haystack, haystack_len, j, needle_len))
262 {
263 /* Scan for matches in right half. */
264 i = suffix;
265 while (i < needle_len && (CANON_ELEMENT (needle[i])
266 == CANON_ELEMENT (haystack[i + j])))
267 ++i;
268 if (needle_len <= i)
269 {
270 /* Scan for matches in left half. */
271 i = suffix - 1;
272 while (i != SIZE_MAX && (CANON_ELEMENT (needle[i])
273 == CANON_ELEMENT (haystack[i + j])))
274 --i;
275 if (i == SIZE_MAX)
276 return (RETURN_TYPE) (haystack + j);
277 j += period;
278 }
279 else
280 j += i - suffix + 1;
281 }
282 }
283 return NULL;
284 }
285
286 /* Return the first location of non-empty NEEDLE within HAYSTACK, or
287 NULL. HAYSTACK_LEN is the minimum known length of HAYSTACK. This
288 method is optimized for LONG_NEEDLE_THRESHOLD <= NEEDLE_LEN.
289 Performance is guaranteed to be linear, with an initialization cost
290 of 3 * NEEDLE_LEN + (1 << CHAR_BIT) operations.
291
292 If AVAILABLE does not modify HAYSTACK_LEN (as in memmem), then at
293 most 2 * HAYSTACK_LEN - NEEDLE_LEN comparisons occur in searching,
294 and sublinear performance O(HAYSTACK_LEN / NEEDLE_LEN) is possible.
295 If AVAILABLE modifies HAYSTACK_LEN (as in strstr), then at most 3 *
296 HAYSTACK_LEN - NEEDLE_LEN comparisons occur in searching, and
297 sublinear performance is not possible. */
298 LIBC_ROM_TEXT_SECTION
299 _LONG_CALL_
300 static RETURN_TYPE
__rtl_two_way_long_needle(const unsigned char * haystack,size_t haystack_len,const unsigned char * needle,size_t needle_len)301 __rtl_two_way_long_needle (const unsigned char *haystack, size_t haystack_len,
302 const unsigned char *needle, size_t needle_len)
303 {
304 size_t i; /* Index into current byte of NEEDLE. */
305 size_t j; /* Index into current window of HAYSTACK. */
306 size_t period; /* The period of the right half of needle. */
307 size_t suffix; /* The index of the right half of needle. */
308 size_t shift_table[1U << CHAR_BIT]; /* See below. */
309
310 /* Factor the needle into two halves, such that the left half is
311 smaller than the global period, and the right half is
312 periodic (with a period as large as NEEDLE_LEN - suffix). */
313 suffix = __rtl_critical_factorization (needle, needle_len, &period);
314
315 /* Populate shift_table. For each possible byte value c,
316 shift_table[c] is the distance from the last occurrence of c to
317 the end of NEEDLE, or NEEDLE_LEN if c is absent from the NEEDLE.
318 shift_table[NEEDLE[NEEDLE_LEN - 1]] contains the only 0. */
319 for (i = 0; i < 1U << CHAR_BIT; i++)
320 shift_table[i] = needle_len;
321 for (i = 0; i < needle_len; i++)
322 shift_table[CANON_ELEMENT (needle[i])] = needle_len - i - 1;
323
324 /* Perform the search. Each iteration compares the right half
325 first. */
326 if (CMP_FUNC (needle, needle + period, suffix) == 0)
327 {
328 /* Entire needle is periodic; a mismatch can only advance by the
329 period, so use memory to avoid rescanning known occurrences
330 of the period. */
331 size_t memory = 0;
332 size_t shift;
333 j = 0;
334 while (AVAILABLE (haystack, haystack_len, j, needle_len))
335 {
336 /* Check the last byte first; if it does not match, then
337 shift to the next possible match location. */
338 shift = shift_table[CANON_ELEMENT (haystack[j + needle_len - 1])];
339 if (0 < shift)
340 {
341 if (memory && shift < period)
342 {
343 /* Since needle is periodic, but the last period has
344 a byte out of place, there can be no match until
345 after the mismatch. */
346 shift = needle_len - period;
347 }
348 memory = 0;
349 j += shift;
350 continue;
351 }
352 /* Scan for matches in right half. The last byte has
353 already been matched, by virtue of the shift table. */
354 i = MAX (suffix, memory);
355 while (i < needle_len - 1 && (CANON_ELEMENT (needle[i])
356 == CANON_ELEMENT (haystack[i + j])))
357 ++i;
358 if (needle_len - 1 <= i)
359 {
360 /* Scan for matches in left half. */
361 i = suffix - 1;
362 while (memory < i + 1 && (CANON_ELEMENT (needle[i])
363 == CANON_ELEMENT (haystack[i + j])))
364 --i;
365 if (i + 1 < memory + 1)
366 return (RETURN_TYPE) (haystack + j);
367 /* No match, so remember how many repetitions of period
368 on the right half were scanned. */
369 j += period;
370 memory = needle_len - period;
371 }
372 else
373 {
374 j += i - suffix + 1;
375 memory = 0;
376 }
377 }
378 }
379 else
380 {
381 /* The two halves of needle are distinct; no extra memory is
382 required, and any mismatch results in a maximal shift. */
383 size_t shift;
384 period = MAX (suffix, needle_len - suffix) + 1;
385 j = 0;
386 while (AVAILABLE (haystack, haystack_len, j, needle_len))
387 {
388 /* Check the last byte first; if it does not match, then
389 shift to the next possible match location. */
390 shift = shift_table[CANON_ELEMENT (haystack[j + needle_len - 1])];
391 if (0 < shift)
392 {
393 j += shift;
394 continue;
395 }
396 /* Scan for matches in right half. The last byte has
397 already been matched, by virtue of the shift table. */
398 i = suffix;
399 while (i < needle_len - 1 && (CANON_ELEMENT (needle[i])
400 == CANON_ELEMENT (haystack[i + j])))
401 ++i;
402 if (needle_len - 1 <= i)
403 {
404 /* Scan for matches in left half. */
405 i = suffix - 1;
406 while (i != SIZE_MAX && (CANON_ELEMENT (needle[i])
407 == CANON_ELEMENT (haystack[i + j])))
408 --i;
409 if (i == SIZE_MAX)
410 return (RETURN_TYPE) (haystack + j);
411 j += period;
412 }
413 else
414 j += i - suffix + 1;
415 }
416 }
417 return NULL;
418 }
419
420 #undef AVAILABLE
421 #undef CANON_ELEMENT
422 #undef CMP_FUNC
423 #undef MAX
424 #undef RETURN_TYPE
425