001 /*
002 * Copyright (C) 2008 The Guava Authors
003 *
004 * Licensed under the Apache License, Version 2.0 (the "License");
005 * you may not use this file except in compliance with the License.
006 * You may obtain a copy of the License at
007 *
008 * http://www.apache.org/licenses/LICENSE-2.0
009 *
010 * Unless required by applicable law or agreed to in writing, software
011 * distributed under the License is distributed on an "AS IS" BASIS,
012 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
013 * See the License for the specific language governing permissions and
014 * limitations under the License.
015 */
016
017 package com.google.common.base;
018
019 import static com.google.common.base.Preconditions.checkArgument;
020 import static com.google.common.base.Preconditions.checkNotNull;
021
022 import com.google.common.annotations.Beta;
023 import com.google.common.annotations.GwtCompatible;
024
025 import java.util.ArrayList;
026 import java.util.Arrays;
027 import java.util.List;
028
029 import javax.annotation.CheckReturnValue;
030
031 /**
032 * Determines a true or false value for any Java {@code char} value, just as {@link Predicate} does
033 * for any {@link Object}. Also offers basic text processing methods based on this function.
034 * Implementations are strongly encouraged to be side-effect-free and immutable.
035 *
036 * <p>Throughout the documentation of this class, the phrase "matching character" is used to mean
037 * "any character {@code c} for which {@code this.matches(c)} returns {@code true}".
038 *
039 * <p><b>Note:</b> This class deals only with {@code char} values; it does not understand
040 * supplementary Unicode code points in the range {@code 0x10000} to {@code 0x10FFFF}. Such logical
041 * characters are encoded into a {@code String} using surrogate pairs, and a {@code CharMatcher}
042 * treats these just as two separate characters.
043 *
044 * <p>Example usages: <pre>
045 * String trimmed = {@link #WHITESPACE WHITESPACE}.{@link #trimFrom trimFrom}(userInput);
046 * if ({@link #ASCII ASCII}.{@link #matchesAllOf matchesAllOf}(s)) { ... }</pre>
047 *
048 * @author Kevin Bourrillion
049 * @since 1.0
050 */
051 @Beta // Possibly change from chars to code points; decide constants vs. methods
052 @GwtCompatible
053 public abstract class CharMatcher implements Predicate<Character> {
054 // Constants
055
// Excludes 2000-200a, which is handled as a range
057 private static final String BREAKING_WHITESPACE_CHARS =
058 "\t\n\013\f\r \u0085\u1680\u2028\u2029\u205f\u3000";
059
060 // Excludes 2007, which is handled as a gap in a pair of ranges
061 private static final String NON_BREAKING_WHITESPACE_CHARS =
062 "\u00a0\u180e\u202f";
063
064 /**
065 * Determines whether a character is whitespace according to the latest Unicode standard, as
066 * illustrated
067 * <a href="http://unicode.org/cldr/utility/list-unicodeset.jsp?a=%5Cp%7Bwhitespace%7D">here</a>.
068 * This is not the same definition used by other Java APIs. (See a
069 * <a href="http://spreadsheets.google.com/pub?key=pd8dAQyHbdewRsnE5x5GzKQ">comparison of several
070 * definitions of "whitespace"</a>.)
071 *
072 * <p><b>Note:</b> as the Unicode definition evolves, we will modify this constant to keep it up
073 * to date.
074 */
075 public static final CharMatcher WHITESPACE =
076 anyOf(BREAKING_WHITESPACE_CHARS + NON_BREAKING_WHITESPACE_CHARS)
077 .or(inRange('\u2000', '\u200a'))
078 .precomputed();
079
080 /**
081 * Determines whether a character is a breaking whitespace (that is, a whitespace which can be
082 * interpreted as a break between words for formatting purposes). See {@link #WHITESPACE} for a
083 * discussion of that term.
084 *
085 * @since 2.0
086 */
087 public static final CharMatcher BREAKING_WHITESPACE =
088 anyOf(BREAKING_WHITESPACE_CHARS)
089 .or(inRange('\u2000', '\u2006'))
090 .or(inRange('\u2008', '\u200a'))
091 .precomputed();
092
093 /**
094 * Determines whether a character is ASCII, meaning that its code point is less than 128.
095 */
096 public static final CharMatcher ASCII = inRange('\0', '\u007f');
097
098 /**
099 * Determines whether a character is a digit according to
100 * <a href="http://unicode.org/cldr/utility/list-unicodeset.jsp?a=%5Cp%7Bdigit%7D">Unicode</a>.
101 */
public static final CharMatcher DIGIT;

static {
  // Each code unit listed here is treated as a "zero"; the loop below adds the ten-wide range
  // that starts at it (zero through zero + 9).
  final String zeroDigits =
      "\u0660\u06f0\u07c0\u0966\u09e6\u0a66\u0ae6\u0b66\u0be6\u0c66"
          + "\u0ce6\u0d66\u0e50\u0ed0\u0f20\u1040\u1090\u17e0\u1810\u1946"
          + "\u19d0\u1b50\u1bb0\u1c40\u1c50\ua620\ua8d0\ua900\uaa50\uff10";

  // Start from the ASCII digits and fold every further range into the union.
  CharMatcher union = inRange('0', '9');
  for (int i = 0; i < zeroDigits.length(); i++) {
    final char zero = zeroDigits.charAt(i);
    final char nine = (char) (zero + 9);
    union = union.or(inRange(zero, nine));
  }
  DIGIT = union.precomputed();
}
115
116 /**
117 * Determines whether a character is a digit according to {@link Character#isDigit(char) Java's
118 * definition}. If you only care to match ASCII digits, you can use {@code inRange('0', '9')}.
119 */
120 public static final CharMatcher JAVA_DIGIT = new CharMatcher() {
121 @Override public boolean matches(char c) {
122 return Character.isDigit(c);
123 }
124 };
125
126 /**
127 * Determines whether a character is a letter according to {@link Character#isLetter(char) Java's
128 * definition}. If you only care to match letters of the Latin alphabet, you can use {@code
129 * inRange('a', 'z').or(inRange('A', 'Z'))}.
130 */
131 public static final CharMatcher JAVA_LETTER = new CharMatcher() {
132 @Override public boolean matches(char c) {
133 return Character.isLetter(c);
134 }
135 };
136
137 /**
138 * Determines whether a character is a letter or digit according to {@link
139 * Character#isLetterOrDigit(char) Java's definition}.
140 */
141 public static final CharMatcher JAVA_LETTER_OR_DIGIT = new CharMatcher() {
142 @Override public boolean matches(char c) {
143 return Character.isLetterOrDigit(c);
144 }
145 };
146
147 /**
148 * Determines whether a character is upper case according to {@link Character#isUpperCase(char)
149 * Java's definition}.
150 */
151 public static final CharMatcher JAVA_UPPER_CASE = new CharMatcher() {
152 @Override public boolean matches(char c) {
153 return Character.isUpperCase(c);
154 }
155 };
156
157 /**
158 * Determines whether a character is lower case according to {@link Character#isLowerCase(char)
159 * Java's definition}.
160 */
161 public static final CharMatcher JAVA_LOWER_CASE = new CharMatcher() {
162 @Override public boolean matches(char c) {
163 return Character.isLowerCase(c);
164 }
165 };
166
167 /**
168 * Determines whether a character is an ISO control character as specified by {@link
169 * Character#isISOControl(char)}.
170 */
171 public static final CharMatcher JAVA_ISO_CONTROL =
172 inRange('\u0000', '\u001f').or(inRange('\u007f', '\u009f'));
173
174 /**
175 * Determines whether a character is invisible; that is, if its Unicode category is any of
176 * SPACE_SEPARATOR, LINE_SEPARATOR, PARAGRAPH_SEPARATOR, CONTROL, FORMAT, SURROGATE, and
177 * PRIVATE_USE according to ICU4J.
178 */
public static final CharMatcher INVISIBLE = inRange('\u0000', '\u0020')
    // Every further range or single code unit is unioned in with or(); the final precomputed()
    // call then asks the platform for a (possibly faster) equivalent of the whole chain.
    .or(inRange('\u007f', '\u00a0'))
    .or(is('\u00ad'))
    .or(inRange('\u0600', '\u0603'))
    .or(anyOf("\u06dd\u070f\u1680\u17b4\u17b5\u180e"))
    .or(inRange('\u2000', '\u200f'))
    .or(inRange('\u2028', '\u202f'))
    .or(inRange('\u205f', '\u2064'))
    .or(inRange('\u206a', '\u206f'))
    .or(is('\u3000'))
    // NOTE(review): one range from U+D800 through U+F8FF; presumably this is what covers the
    // SURROGATE and PRIVATE_USE categories named in the Javadoc -- confirm against Unicode data.
    .or(inRange('\ud800', '\uf8ff'))
    .or(anyOf("\ufeff\ufff9\ufffa\ufffb"))
    .precomputed();
192
193 /**
194 * Determines whether a character is single-width (not double-width). When in doubt, this matcher
195 * errs on the side of returning {@code false} (that is, it tends to assume a character is
196 * double-width).
197 *
198 * <p><b>Note:</b> as the reference file evolves, we will modify this constant to keep it up to
199 * date.
200 */
public static final CharMatcher SINGLE_WIDTH = inRange('\u0000', '\u04f9')
    // Every further range or single code unit is unioned in with or(). Anything not listed
    // here is simply not matched, which is the "errs on the side of false" behavior the
    // Javadoc describes.
    .or(is('\u05be'))
    .or(inRange('\u05d0', '\u05ea'))
    .or(is('\u05f3'))
    .or(is('\u05f4'))
    .or(inRange('\u0600', '\u06ff'))
    .or(inRange('\u0750', '\u077f'))
    .or(inRange('\u0e00', '\u0e7f'))
    .or(inRange('\u1e00', '\u20af'))
    .or(inRange('\u2100', '\u213a'))
    .or(inRange('\ufb50', '\ufdff'))
    .or(inRange('\ufe70', '\ufeff'))
    .or(inRange('\uff61', '\uffdc'))
    // Ask the platform for a (possibly faster) equivalent of the whole chain.
    .precomputed();
215
216 /** Matches any character. */
217 public static final CharMatcher ANY =
218 new CharMatcher() {
219 @Override public boolean matches(char c) {
220 return true;
221 }
222
223 @Override public int indexIn(CharSequence sequence) {
224 return (sequence.length() == 0) ? -1 : 0;
225 }
226
227 @Override public int indexIn(CharSequence sequence, int start) {
228 int length = sequence.length();
229 Preconditions.checkPositionIndex(start, length);
230 return (start == length) ? -1 : start;
231 }
232
233 @Override public int lastIndexIn(CharSequence sequence) {
234 return sequence.length() - 1;
235 }
236
237 @Override public boolean matchesAllOf(CharSequence sequence) {
238 checkNotNull(sequence);
239 return true;
240 }
241
242 @Override public boolean matchesNoneOf(CharSequence sequence) {
243 return sequence.length() == 0;
244 }
245
246 @Override public String removeFrom(CharSequence sequence) {
247 checkNotNull(sequence);
248 return "";
249 }
250
251 @Override public String replaceFrom(CharSequence sequence, char replacement) {
252 char[] array = new char[sequence.length()];
253 Arrays.fill(array, replacement);
254 return new String(array);
255 }
256
257 @Override public String replaceFrom(CharSequence sequence, CharSequence replacement) {
258 StringBuilder retval = new StringBuilder(sequence.length() * replacement.length());
259 for (int i = 0; i < sequence.length(); i++) {
260 retval.append(replacement);
261 }
262 return retval.toString();
263 }
264
265 @Override public String collapseFrom(CharSequence sequence, char replacement) {
266 return (sequence.length() == 0) ? "" : String.valueOf(replacement);
267 }
268
269 @Override public String trimFrom(CharSequence sequence) {
270 checkNotNull(sequence);
271 return "";
272 }
273
274 @Override public int countIn(CharSequence sequence) {
275 return sequence.length();
276 }
277
278 @Override public CharMatcher and(CharMatcher other) {
279 return checkNotNull(other);
280 }
281
282 @Override public CharMatcher or(CharMatcher other) {
283 checkNotNull(other);
284 return this;
285 }
286
287 @Override public CharMatcher negate() {
288 return NONE;
289 }
290
291 @Override public CharMatcher precomputed() {
292 return this;
293 }
294 };
295
296 /** Matches no characters. */
297 public static final CharMatcher NONE =
298 new CharMatcher() {
299 @Override public boolean matches(char c) {
300 return false;
301 }
302
303 @Override public int indexIn(CharSequence sequence) {
304 checkNotNull(sequence);
305 return -1;
306 }
307
308 @Override public int indexIn(CharSequence sequence, int start) {
309 int length = sequence.length();
310 Preconditions.checkPositionIndex(start, length);
311 return -1;
312 }
313
314 @Override public int lastIndexIn(CharSequence sequence) {
315 checkNotNull(sequence);
316 return -1;
317 }
318
319 @Override public boolean matchesAllOf(CharSequence sequence) {
320 return sequence.length() == 0;
321 }
322
323 @Override public boolean matchesNoneOf(CharSequence sequence) {
324 checkNotNull(sequence);
325 return true;
326 }
327
328 @Override public String removeFrom(CharSequence sequence) {
329 return sequence.toString();
330 }
331
332 @Override public String replaceFrom(CharSequence sequence, char replacement) {
333 return sequence.toString();
334 }
335
336 @Override public String replaceFrom(CharSequence sequence, CharSequence replacement) {
337 checkNotNull(replacement);
338 return sequence.toString();
339 }
340
341 @Override public String collapseFrom(CharSequence sequence, char replacement) {
342 return sequence.toString();
343 }
344
345 @Override public String trimFrom(CharSequence sequence) {
346 return sequence.toString();
347 }
348
349 @Override public int countIn(CharSequence sequence) {
350 checkNotNull(sequence);
351 return 0;
352 }
353
354 @Override public CharMatcher and(CharMatcher other) {
355 checkNotNull(other);
356 return this;
357 }
358
359 @Override public CharMatcher or(CharMatcher other) {
360 return checkNotNull(other);
361 }
362
363 @Override public CharMatcher negate() {
364 return ANY;
365 }
366
367 @Override void setBits(LookupTable table) {}
368
369 @Override public CharMatcher precomputed() {
370 return this;
371 }
372 };
373
374 // Static factories
375
376 /**
377 * Returns a {@code char} matcher that matches only one specified character.
378 */
379 public static CharMatcher is(final char match) {
380 return new CharMatcher() {
381 @Override public boolean matches(char c) {
382 return c == match;
383 }
384
385 @Override public String replaceFrom(CharSequence sequence, char replacement) {
386 return sequence.toString().replace(match, replacement);
387 }
388
389 @Override public CharMatcher and(CharMatcher other) {
390 return other.matches(match) ? this : NONE;
391 }
392
393 @Override public CharMatcher or(CharMatcher other) {
394 return other.matches(match) ? other : super.or(other);
395 }
396
397 @Override public CharMatcher negate() {
398 return isNot(match);
399 }
400
401 @Override void setBits(LookupTable table) {
402 table.set(match);
403 }
404
405 @Override public CharMatcher precomputed() {
406 return this;
407 }
408 };
409 }
410
411 /**
412 * Returns a {@code char} matcher that matches any character except the one specified.
413 *
414 * <p>To negate another {@code CharMatcher}, use {@link #negate()}.
415 */
416 public static CharMatcher isNot(final char match) {
417 return new CharMatcher() {
418 @Override public boolean matches(char c) {
419 return c != match;
420 }
421
422 @Override public CharMatcher and(CharMatcher other) {
423 return other.matches(match) ? super.and(other) : other;
424 }
425
426 @Override public CharMatcher or(CharMatcher other) {
427 return other.matches(match) ? ANY : this;
428 }
429
430 @Override public CharMatcher negate() {
431 return is(match);
432 }
433 };
434 }
435
436 /**
437 * Returns a {@code char} matcher that matches any character present in the given character
438 * sequence.
439 */
440 public static CharMatcher anyOf(final CharSequence sequence) {
441 switch (sequence.length()) {
442 case 0:
443 return NONE;
444 case 1:
445 return is(sequence.charAt(0));
446 case 2:
447 final char match1 = sequence.charAt(0);
448 final char match2 = sequence.charAt(1);
449 return new CharMatcher() {
450 @Override public boolean matches(char c) {
451 return c == match1 || c == match2;
452 }
453
454 @Override void setBits(LookupTable table) {
455 table.set(match1);
456 table.set(match2);
457 }
458
459 @Override public CharMatcher precomputed() {
460 return this;
461 }
462 };
463 }
464
465 final char[] chars = sequence.toString().toCharArray();
466 Arrays.sort(chars); // not worth collapsing duplicates
467
468 return new CharMatcher() {
469 @Override public boolean matches(char c) {
470 return Arrays.binarySearch(chars, c) >= 0;
471 }
472
473 @Override void setBits(LookupTable table) {
474 for (char c : chars) {
475 table.set(c);
476 }
477 }
478 };
479 }
480
481 /**
482 * Returns a {@code char} matcher that matches any character not present in the given character
483 * sequence.
484 */
485 public static CharMatcher noneOf(CharSequence sequence) {
486 return anyOf(sequence).negate();
487 }
488
489 /**
490 * Returns a {@code char} matcher that matches any character in a given range (both endpoints are
491 * inclusive). For example, to match any lowercase letter of the English alphabet, use {@code
492 * CharMatcher.inRange('a', 'z')}.
493 *
494 * @throws IllegalArgumentException if {@code endInclusive < startInclusive}
495 */
496 public static CharMatcher inRange(final char startInclusive, final char endInclusive) {
497 checkArgument(endInclusive >= startInclusive);
498 return new CharMatcher() {
499 @Override public boolean matches(char c) {
500 return startInclusive <= c && c <= endInclusive;
501 }
502
503 @Override void setBits(LookupTable table) {
504 char c = startInclusive;
505 while (true) {
506 table.set(c);
507 if (c++ == endInclusive) {
508 break;
509 }
510 }
511 }
512
513 @Override public CharMatcher precomputed() {
514 return this;
515 }
516 };
517 }
518
519 /**
520 * Returns a matcher with identical behavior to the given {@link Character}-based predicate, but
521 * which operates on primitive {@code char} instances instead.
522 */
523 public static CharMatcher forPredicate(final Predicate<? super Character> predicate) {
524 checkNotNull(predicate);
525 if (predicate instanceof CharMatcher) {
526 return (CharMatcher) predicate;
527 }
528 return new CharMatcher() {
529 @Override public boolean matches(char c) {
530 return predicate.apply(c);
531 }
532
533 @Override public boolean apply(Character character) {
534 return predicate.apply(checkNotNull(character));
535 }
536 };
537 }
538
539 // Constructors
540
/**
 * Constructor for use by subclasses. It takes no arguments and its body is empty.
 */
protected CharMatcher() {}
545
546 // Abstract methods
547
/**
 * Determines a true or false value for the given character.
 *
 * @param c the character to test
 * @return {@code true} if this matcher matches {@code c}, {@code false} otherwise
 */
public abstract boolean matches(char c);
550
551 // Non-static factories
552
553 /**
554 * Returns a matcher that matches any character not matched by this matcher.
555 */
556 public CharMatcher negate() {
557 final CharMatcher original = this;
558 return new CharMatcher() {
559 @Override public boolean matches(char c) {
560 return !original.matches(c);
561 }
562
563 @Override public boolean matchesAllOf(CharSequence sequence) {
564 return original.matchesNoneOf(sequence);
565 }
566
567 @Override public boolean matchesNoneOf(CharSequence sequence) {
568 return original.matchesAllOf(sequence);
569 }
570
571 @Override public int countIn(CharSequence sequence) {
572 return sequence.length() - original.countIn(sequence);
573 }
574
575 @Override public CharMatcher negate() {
576 return original;
577 }
578 };
579 }
580
581 /**
582 * Returns a matcher that matches any character matched by both this matcher and {@code other}.
583 */
584 public CharMatcher and(CharMatcher other) {
585 return new And(Arrays.asList(this, checkNotNull(other)));
586 }
587
588 private static class And extends CharMatcher {
589 List<CharMatcher> components;
590
591 And(List<CharMatcher> components) {
592 this.components = components; // Skip defensive copy (private)
593 }
594
595 @Override public boolean matches(char c) {
596 for (CharMatcher matcher : components) {
597 if (!matcher.matches(c)) {
598 return false;
599 }
600 }
601 return true;
602 }
603
604 @Override public CharMatcher and(CharMatcher other) {
605 List<CharMatcher> newComponents = new ArrayList<CharMatcher>(components);
606 newComponents.add(checkNotNull(other));
607 return new And(newComponents);
608 }
609 }
610
611 /**
612 * Returns a matcher that matches any character matched by either this matcher or {@code other}.
613 */
614 public CharMatcher or(CharMatcher other) {
615 return new Or(Arrays.asList(this, checkNotNull(other)));
616 }
617
618 private static class Or extends CharMatcher {
619 List<CharMatcher> components;
620
621 Or(List<CharMatcher> components) {
622 this.components = components; // Skip defensive copy (private)
623 }
624
625 @Override public boolean matches(char c) {
626 for (CharMatcher matcher : components) {
627 if (matcher.matches(c)) {
628 return true;
629 }
630 }
631 return false;
632 }
633
634 @Override public CharMatcher or(CharMatcher other) {
635 List<CharMatcher> newComponents = new ArrayList<CharMatcher>(components);
636 newComponents.add(checkNotNull(other));
637 return new Or(newComponents);
638 }
639
640 @Override void setBits(LookupTable table) {
641 for (CharMatcher matcher : components) {
642 matcher.setBits(table);
643 }
644 }
645 }
646
647 /**
648 * Returns a {@code char} matcher functionally equivalent to this one, but which may be faster to
649 * query than the original; your mileage may vary. Precomputation takes time and is likely to be
650 * worthwhile only if the precomputed matcher is queried many thousands of times.
651 *
652 * <p>This method has no effect (returns {@code this}) when called in GWT: it's unclear whether a
653 * precomputed matcher is faster, but it certainly consumes more memory, which doesn't seem like a
654 * worthwhile tradeoff in a browser.
655 */
656 public CharMatcher precomputed() {
657 return Platform.precomputeCharMatcher(this);
658 }
659
660 /**
661 * This is the actual implementation of {@link #precomputed}, but we bounce calls through a method
662 * on {@link Platform} so that we can have different behavior in GWT.
663 *
664 * <p>The default precomputation is to cache the configuration of the original matcher in an
665 * eight-kilobyte bit array. In some situations this produces a matcher which is faster to query
666 * than the original.
667 *
668 * <p>The default implementation creates a new bit array and passes it to {@link
669 * #setBits(LookupTable)}.
670 */
671 CharMatcher precomputedInternal() {
672 final LookupTable table = new LookupTable();
673 setBits(table);
674
675 return new CharMatcher() {
676 @Override public boolean matches(char c) {
677 return table.get(c);
678 }
679
680 // TODO(kevinb): make methods like negate() smart?
681
682 @Override public CharMatcher precomputed() {
683 return this;
684 }
685 };
686 }
687
688 /**
689 * For use by implementors; sets the bit corresponding to each character ('\0' to '{@literal
690 * \}uFFFF') that matches this matcher in the given bit array, leaving all other bits untouched.
691 *
692 * <p>The default implementation loops over every possible character value, invoking {@link
693 * #matches} for each one.
694 */
695 void setBits(LookupTable table) {
696 char c = Character.MIN_VALUE;
697 while (true) {
698 if (matches(c)) {
699 table.set(c);
700 }
701 if (c++ == Character.MAX_VALUE) {
702 break;
703 }
704 }
705 }
706
707 /**
708 * A bit array with one bit per {@code char} value, used by {@link CharMatcher#precomputed}.
709 *
710 * <p>TODO(kevinb): possibly share a common BitArray class with BloomFilter and others... a
711 * simpler java.util.BitSet.
712 */
713 private static final class LookupTable {
714 int[] data = new int[2048];
715
716 void set(char index) {
717 data[index >> 5] |= (1 << index);
718 }
719
720 boolean get(char index) {
721 return (data[index >> 5] & (1 << index)) != 0;
722 }
723 }
724
725 // Text processing routines
726
727 /**
728 * Returns {@code true} if a character sequence contains at least one matching character.
729 * Equivalent to {@code !matchesNoneOf(sequence)}.
730 *
731 * <p>The default implementation iterates over the sequence, invoking {@link #matches} for each
732 * character, until this returns {@code true} or the end is reached.
733 *
734 * @param sequence the character sequence to examine, possibly empty
735 * @return {@code true} if this matcher matches at least one character in the sequence
736 * @since 8.0
737 */
738 public boolean matchesAnyOf(CharSequence sequence) {
739 return !matchesNoneOf(sequence);
740 }
741
742 /**
743 * Returns {@code true} if a character sequence contains only matching characters.
744 *
745 * <p>The default implementation iterates over the sequence, invoking {@link #matches} for each
746 * character, until this returns {@code false} or the end is reached.
747 *
748 * @param sequence the character sequence to examine, possibly empty
749 * @return {@code true} if this matcher matches every character in the sequence, including when
750 * the sequence is empty
751 */
752 public boolean matchesAllOf(CharSequence sequence) {
753 for (int i = sequence.length() - 1; i >= 0; i--) {
754 if (!matches(sequence.charAt(i))) {
755 return false;
756 }
757 }
758 return true;
759 }
760
761 /**
762 * Returns {@code true} if a character sequence contains no matching characters. Equivalent to
763 * {@code !matchesAnyOf(sequence)}.
764 *
765 * <p>The default implementation iterates over the sequence, invoking {@link #matches} for each
766 * character, until this returns {@code false} or the end is reached.
767 *
768 * @param sequence the character sequence to examine, possibly empty
769 * @return {@code true} if this matcher matches every character in the sequence, including when
770 * the sequence is empty
771 */
772 public boolean matchesNoneOf(CharSequence sequence) {
773 return indexIn(sequence) == -1;
774 }
775
// matchesAnyOf() now exists above (since 8.0); the old TODO to add it is resolved.
777
778 /**
779 * Returns the index of the first matching character in a character sequence, or {@code -1} if no
780 * matching character is present.
781 *
782 * <p>The default implementation iterates over the sequence in forward order calling {@link
783 * #matches} for each character.
784 *
785 * @param sequence the character sequence to examine from the beginning
786 * @return an index, or {@code -1} if no character matches
787 */
788 public int indexIn(CharSequence sequence) {
789 int length = sequence.length();
790 for (int i = 0; i < length; i++) {
791 if (matches(sequence.charAt(i))) {
792 return i;
793 }
794 }
795 return -1;
796 }
797
798 /**
799 * Returns the index of the first matching character in a character sequence, starting from a
800 * given position, or {@code -1} if no character matches after that position.
801 *
802 * <p>The default implementation iterates over the sequence in forward order, beginning at {@code
803 * start}, calling {@link #matches} for each character.
804 *
805 * @param sequence the character sequence to examine
806 * @param start the first index to examine; must be nonnegative and no greater than {@code
807 * sequence.length()}
808 * @return the index of the first matching character, guaranteed to be no less than {@code start},
809 * or {@code -1} if no character matches
810 * @throws IndexOutOfBoundsException if start is negative or greater than {@code
811 * sequence.length()}
812 */
813 public int indexIn(CharSequence sequence, int start) {
814 int length = sequence.length();
815 Preconditions.checkPositionIndex(start, length);
816 for (int i = start; i < length; i++) {
817 if (matches(sequence.charAt(i))) {
818 return i;
819 }
820 }
821 return -1;
822 }
823
824 /**
825 * Returns the index of the last matching character in a character sequence, or {@code -1} if no
826 * matching character is present.
827 *
828 * <p>The default implementation iterates over the sequence in reverse order calling {@link
829 * #matches} for each character.
830 *
831 * @param sequence the character sequence to examine from the end
832 * @return an index, or {@code -1} if no character matches
833 */
834 public int lastIndexIn(CharSequence sequence) {
835 for (int i = sequence.length() - 1; i >= 0; i--) {
836 if (matches(sequence.charAt(i))) {
837 return i;
838 }
839 }
840 return -1;
841 }
842
843 /**
844 * Returns the number of matching characters found in a character sequence.
845 */
846 public int countIn(CharSequence sequence) {
847 int count = 0;
848 for (int i = 0; i < sequence.length(); i++) {
849 if (matches(sequence.charAt(i))) {
850 count++;
851 }
852 }
853 return count;
854 }
855
856 /**
857 * Returns a string containing all non-matching characters of a character sequence, in order. For
858 * example: <pre> {@code
859 *
860 * CharMatcher.is('a').removeFrom("bazaar")}</pre>
861 *
862 * ... returns {@code "bzr"}.
863 */
@CheckReturnValue
public String removeFrom(CharSequence sequence) {
  String string = sequence.toString();
  int pos = indexIn(string);
  if (pos == -1) {
    // No matching character at all: the string form of the input is returned as-is.
    return string;
  }

  // From here on, chars is compacted in place: kept characters are shifted left over the
  // matching ones.
  char[] chars = string.toCharArray();
  // Number of matching characters found so far, which is also how far each kept character
  // has to move left.
  int spread = 1;

  // This unusual loop comes from extensive benchmarking
  OUT: while (true) {
    // Step past the matching character currently at pos.
    pos++;
    while (true) {
      if (pos == chars.length) {
        break OUT;
      }
      if (matches(chars[pos])) {
        // Another character to drop; the outer loop bumps spread and skips it.
        break;
      }
      chars[pos - spread] = chars[pos];
      pos++;
    }
    spread++;
  }
  // pos equals chars.length here, so pos - spread is the number of characters that were kept.
  return new String(chars, 0, pos - spread);
}
892
893 /**
894 * Returns a string containing all matching characters of a character sequence, in order. For
895 * example: <pre> {@code
896 *
897 * CharMatcher.is('a').retainFrom("bazaar")}</pre>
898 *
899 * ... returns {@code "aaa"}.
900 */
901 @CheckReturnValue
902 public String retainFrom(CharSequence sequence) {
903 return negate().removeFrom(sequence);
904 }
905
906 /**
907 * Returns a string copy of the input character sequence, with each character that matches this
908 * matcher replaced by a given replacement character. For example: <pre> {@code
909 *
910 * CharMatcher.is('a').replaceFrom("radar", 'o')}</pre>
911 *
912 * ... returns {@code "rodor"}.
913 *
914 * <p>The default implementation uses {@link #indexIn(CharSequence)} to find the first matching
915 * character, then iterates the remainder of the sequence calling {@link #matches(char)} for each
916 * character.
917 *
918 * @param sequence the character sequence to replace matching characters in
919 * @param replacement the character to append to the result string in place of each matching
920 * character in {@code sequence}
921 * @return the new string
922 */
923 @CheckReturnValue
924 public String replaceFrom(CharSequence sequence, char replacement) {
925 String string = sequence.toString();
926 int pos = indexIn(string);
927 if (pos == -1) {
928 return string;
929 }
930 char[] chars = string.toCharArray();
931 chars[pos] = replacement;
932 for (int i = pos + 1; i < chars.length; i++) {
933 if (matches(chars[i])) {
934 chars[i] = replacement;
935 }
936 }
937 return new String(chars);
938 }
939
940 /**
941 * Returns a string copy of the input character sequence, with each character that matches this
942 * matcher replaced by a given replacement sequence. For example: <pre> {@code
943 *
944 * CharMatcher.is('a').replaceFrom("yaha", "oo")}</pre>
945 *
946 * ... returns {@code "yoohoo"}.
947 *
948 * <p><b>Note:</b> If the replacement is a fixed string with only one character, you are better
949 * off calling {@link #replaceFrom(CharSequence, char)} directly.
950 *
951 * @param sequence the character sequence to replace matching characters in
952 * @param replacement the characters to append to the result string in place of each matching
953 * character in {@code sequence}
954 * @return the new string
955 */
956 @CheckReturnValue
957 public String replaceFrom(CharSequence sequence, CharSequence replacement) {
958 int replacementLen = replacement.length();
959 if (replacementLen == 0) {
960 return removeFrom(sequence);
961 }
962 if (replacementLen == 1) {
963 return replaceFrom(sequence, replacement.charAt(0));
964 }
965
966 String string = sequence.toString();
967 int pos = indexIn(string);
968 if (pos == -1) {
969 return string;
970 }
971
972 int len = string.length();
973 StringBuilder buf = new StringBuilder((len * 3 / 2) + 16);
974
975 int oldpos = 0;
976 do {
977 buf.append(string, oldpos, pos);
978 buf.append(replacement);
979 oldpos = pos + 1;
980 pos = indexIn(string, oldpos);
981 } while (pos != -1);
982
983 buf.append(string, oldpos, len);
984 return buf.toString();
985 }
986
987 /**
988 * Returns a substring of the input character sequence that omits all characters this matcher
989 * matches from the beginning and from the end of the string. For example: <pre> {@code
990 *
991 * CharMatcher.anyOf("ab").trimFrom("abacatbab")}</pre>
992 *
993 * ... returns {@code "cat"}.
994 *
995 * <p>Note that: <pre> {@code
996 *
997 * CharMatcher.inRange('\0', ' ').trimFrom(str)}</pre>
998 *
999 * ... is equivalent to {@link String#trim()}.
1000 */
1001 @CheckReturnValue
1002 public String trimFrom(CharSequence sequence) {
1003 int len = sequence.length();
1004 int first;
1005 int last;
1006
1007 for (first = 0; first < len; first++) {
1008 if (!matches(sequence.charAt(first))) {
1009 break;
1010 }
1011 }
1012 for (last = len - 1; last > first; last--) {
1013 if (!matches(sequence.charAt(last))) {
1014 break;
1015 }
1016 }
1017
1018 return sequence.subSequence(first, last + 1).toString();
1019 }
1020
1021 /**
1022 * Returns a substring of the input character sequence that omits all characters this matcher
1023 * matches from the beginning of the string. For example: <pre> {@code
1024 *
1025 * CharMatcher.anyOf("ab").trimLeadingFrom("abacatbab")}</pre>
1026 *
1027 * ... returns {@code "catbab"}.
1028 */
1029 @CheckReturnValue
1030 public String trimLeadingFrom(CharSequence sequence) {
1031 int len = sequence.length();
1032 int first;
1033
1034 for (first = 0; first < len; first++) {
1035 if (!matches(sequence.charAt(first))) {
1036 break;
1037 }
1038 }
1039
1040 return sequence.subSequence(first, len).toString();
1041 }
1042
1043 /**
1044 * Returns a substring of the input character sequence that omits all characters this matcher
1045 * matches from the end of the string. For example: <pre> {@code
1046 *
1047 * CharMatcher.anyOf("ab").trimTrailingFrom("abacatbab")}</pre>
1048 *
1049 * ... returns {@code "abacat"}.
1050 */
1051 @CheckReturnValue
1052 public String trimTrailingFrom(CharSequence sequence) {
1053 int len = sequence.length();
1054 int last;
1055
1056 for (last = len - 1; last >= 0; last--) {
1057 if (!matches(sequence.charAt(last))) {
1058 break;
1059 }
1060 }
1061
1062 return sequence.subSequence(0, last + 1).toString();
1063 }
1064
1065 /**
1066 * Returns a string copy of the input character sequence, with each group of consecutive
1067 * characters that match this matcher replaced by a single replacement character. For example:
1068 * <pre> {@code
1069 *
1070 * CharMatcher.anyOf("eko").collapseFrom("bookkeeper", '-')}</pre>
1071 *
1072 * ... returns {@code "b-p-r"}.
1073 *
1074 * <p>The default implementation uses {@link #indexIn(CharSequence)} to find the first matching
1075 * character, then iterates the remainder of the sequence calling {@link #matches(char)} for each
1076 * character.
1077 *
1078 * @param sequence the character sequence to replace matching groups of characters in
1079 * @param replacement the character to append to the result string in place of each group of
1080 * matching characters in {@code sequence}
1081 * @return the new string
1082 */
1083 @CheckReturnValue
1084 public String collapseFrom(CharSequence sequence, char replacement) {
1085 int first = indexIn(sequence);
1086 if (first == -1) {
1087 return sequence.toString();
1088 }
1089
1090 // TODO(kevinb): see if this implementation can be made faster
1091 StringBuilder builder = new StringBuilder(sequence.length())
1092 .append(sequence.subSequence(0, first))
1093 .append(replacement);
1094 boolean in = true;
1095 for (int i = first + 1; i < sequence.length(); i++) {
1096 char c = sequence.charAt(i);
1097 if (apply(c)) {
1098 if (!in) {
1099 builder.append(replacement);
1100 in = true;
1101 }
1102 } else {
1103 builder.append(c);
1104 in = false;
1105 }
1106 }
1107 return builder.toString();
1108 }
1109
1110 /**
1111 * Collapses groups of matching characters exactly as {@link #collapseFrom} does, except that
1112 * groups of matching characters at the start or end of the sequence are removed without
1113 * replacement.
1114 */
1115 @CheckReturnValue
1116 public String trimAndCollapseFrom(CharSequence sequence, char replacement) {
1117 int first = negate().indexIn(sequence);
1118 if (first == -1) {
1119 return ""; // everything matches. nothing's left.
1120 }
1121 StringBuilder builder = new StringBuilder(sequence.length());
1122 boolean inMatchingGroup = false;
1123 for (int i = first; i < sequence.length(); i++) {
1124 char c = sequence.charAt(i);
1125 if (apply(c)) {
1126 inMatchingGroup = true;
1127 } else {
1128 if (inMatchingGroup) {
1129 builder.append(replacement);
1130 inMatchingGroup = false;
1131 }
1132 builder.append(c);
1133 }
1134 }
1135 return builder.toString();
1136 }
1137
1138 // Predicate interface
1139
1140 /**
1141 * Returns {@code true} if this matcher matches the given character.
1142 *
1143 * @throws NullPointerException if {@code character} is null
1144 */
1145 @Override public boolean apply(Character character) {
1146 return matches(character);
1147 }
1148 }