diff options
author | Sven Göthel <[email protected]> | 2024-02-04 07:05:40 +0100 |
---|---|---|
committer | Sven Göthel <[email protected]> | 2024-02-04 07:05:40 +0100 |
commit | 36f595b37220976125222f7f5b3dbfc74df163e0 (patch) | |
tree | 43cf879e24d3865e16b5d3ac1fd19bd21097bc82 /src/java/com/jogamp | |
parent | a67d19d005de669c0c4b240745d5359921acc90a (diff) |
Adding StringUtil, containing basic utility functions as well as unicode space (whitespace) and full- and halfwidth tests.
Diffstat (limited to 'src/java/com/jogamp')
-rw-r--r-- | src/java/com/jogamp/common/util/StringUtil.java | 327 |
1 files changed, 327 insertions, 0 deletions
diff --git a/src/java/com/jogamp/common/util/StringUtil.java b/src/java/com/jogamp/common/util/StringUtil.java new file mode 100644 index 0000000..4084a03 --- /dev/null +++ b/src/java/com/jogamp/common/util/StringUtil.java @@ -0,0 +1,327 @@ +/** + * Copyright 2014 JogAmp Community. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are + * permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, this list + * of conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY JogAmp Community ``AS IS'' AND ANY EXPRESS OR IMPLIED + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL JogAmp Community OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON + * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF + * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * The views and conclusions contained in the software and documentation are those of the + * authors and should not be interpreted as representing official policies, either expressed + * or implied, of JogAmp Community. + */ +package com.jogamp.common.util; + +import java.util.ArrayList; +import java.util.List; + +/** Basic utility functions for {@link String} and {@link CharSequence} in general. 
/**
 * Basic utility functions for {@link String} and {@link CharSequence} in general.
 * <p>
 * All methods are static and operate on unicode code points where noted.
 * </p>
 */
public class StringUtil {
    /** Linefeed character unicode {@code '\n'}, 0x000A. */
    public static final char LF = '\n';
    /** CR character unicode {@code '\r'}, 0x000D. */
    public static final char CR = '\r';
    /** Space character unicode {@code ' '}, 0x0020. */
    public static final char SPACE = ' ';

    /**
     * List of ASCII and Unicode space separator, aka {@code Whitespace}.
     * @see <a href="https://www.compart.com/en/unicode/category/Zs">Unicode category Zs</a>
     * @see <a href="https://en.wikipedia.org/wiki/Whitespace_character">Whitespace character</a>
     * @see <a href="https://www.unicode.org/reports/tr44/#General_Category_Values">Unicode TR44, General Category Values</a>
     * @see <a href="ftp://ftp.unicode.org/Public/UNIDATA/Scripts.txt">Unicode Scripts.txt</a>
     * @see <a href="https://www.w3schools.com/charsets/ref_utf_punctuation.asp">UTF-8 General Punctuation</a>
     */
    public static final String WHITESPACE = fromCodePoints(
            0x0009, // char tabulator
            LF,     // LF 0x000A
            0x000B, // line tab
            0x000C, // FF
            CR,     // CR 0x000D
            SPACE,  // SPACE 0x0020
            0x0085, // Next Line
            0x00A0, // No-break space (NBSP)
            0x1680, // Ogham Space Mark
            0x2000, // En Quad
            0x2001, // Em Quad
            0x2002, // En Space
            0x2003, // Em Space
            0x2004, // Three-Per Em-Space
            0x2005, // Four-Per Em-Space
            0x2006, // Six-Per Em-Space
            0x2007, // Figure-Space
            0x2008, // Punctuation-Space
            0x2009, // Thin-Space
            0x200A, // Hair-Space
            0x202F, // Narrow No-break space (NNBSP)
            0x205F, // Medium Mathematical space (MMSP)
            0x3000  // Ideographic Space
    );

    /** Builds a string from the given unicode code points, preserving their order. */
    private static String fromCodePoints(final int... codePoints) {
        return new String(codePoints, 0, codePoints.length);
    }

    /**
     * Returns true if {@code cp} is a separator: contained in {@code separators},
     * or {@link Character#isWhitespace(int) whitespace} if {@code separators} is {@code null}.
     */
    private static boolean isSeparator(final int cp, final String separators) {
        return null == separators ? Character.isWhitespace(cp) : 0 <= separators.indexOf(cp);
    }

    /** Return true if given codepoint is included within {@link #WHITESPACE}. */
    public static boolean isWhitespace(final int cp) {
        return 0 <= WHITESPACE.indexOf(cp);
    }

    /**
     * Returns true if given codepoint is a fullwidth unicode character,
     * i.e. within [0xff01..0xff60].
     * @see <a href="https://www.compart.com/en/unicode/block/U+FF00">Halfwidth and Fullwidth Forms</a>
     */
    public static boolean isFullwidth(final int cp) {
        return 0xff01 <= cp && cp <= 0xff60;
    }

    /**
     * Returns true if given codepoint is a halfwidth unicode character,
     * i.e. within [0xff61..0xffee].
     * @see <a href="https://www.compart.com/en/unicode/block/U+FF00">Halfwidth and Fullwidth Forms</a>
     */
    public static boolean isHalfwidth(final int cp) {
        // Range test: the lower bound must be '<=', an equality test would only accept 0xff61.
        return 0xff61 <= cp && cp <= 0xffee;
    }

    /** Returns true if given codepoint is either {@link #isFullwidth(int)}, {@link #isHalfwidth(int)} or {@link #isWhitespace(int)}. */
    public static boolean hasSpace(final int cp) {
        return isFullwidth(cp) || isHalfwidth(cp) || isWhitespace(cp);
    }

    /**
     * Returns number of lines separated by {@link #LF}.
     * <p>
     * Empty lines in between are counted as well, a single trailing {@link #LF}
     * does not start a new line. {@code null} or an empty sequence results in zero.
     * </p>
     * @param s the character sequence to inspect, may be {@code null}
     * @return number of lines
     */
    public static int getLineCount(final CharSequence s) {
        if( null == s ) {
            return 0;
        }
        final int len = s.length();
        int lc = 0;
        int i = 0;
        while( i < len ) {
            ++lc; // content or an empty line starts at i
            final int j = indexOf(s, LF, i);
            if( 0 > j ) {
                break;
            }
            i = j + 1;
        }
        return lc;
    }

    /**
     * Calls {@link String#indexOf(int, int)}
     * @param hay the unicode character string to search in from {@code start}, {@code null} results in {@code -1}
     * @param needle the unicode code point character to search
     * @param start index to start searching
     * @return {@code -1} if not found, otherwise [0..{@link String#length()}-1].
     * @see #indexOf(CharSequence, int, int)
     * @see String#indexOf(int, int)
     */
    public static int indexOf(final String hay, final int needle, final int start) {
        if( null != hay ) {
            return hay.indexOf(needle, start);
        }
        return -1;
    }

    /**
     * Naive implementation of {@link String#indexOf(int, int)} for type {@link CharSequence}.
     * <p>
     * Uses {@link String#indexOf(int, int)} if {@code hay} is of type {@link String},
     * otherwise the sequence is scanned linearly, matching a supplementary code point
     * against its surrogate pair.
     * </p>
     * @param hay the unicode character sequence to search in from {@code start}, {@code null} results in {@code -1}
     * @param needle the unicode code point character to search
     * @param start index to start searching, negative values are treated as zero
     * @return {@code -1} if not found, otherwise [0..{@link CharSequence#length()}-1].
     * @see #indexOf(String, int, int)
     * @see String#indexOf(int, int)
     */
    public static int indexOf(final CharSequence hay, final int needle, final int start) {
        if( null == hay ) {
            return -1;
        }
        if( hay instanceof String ) {
            return ((String) hay).indexOf(needle, start);
        }
        final int l = hay.length();
        final int s = Math.max(0, start);
        if( l <= s ) {
            return -1;
        }
        if( needle < Character.MIN_SUPPLEMENTARY_CODE_POINT ) {
            // BMP (or invalid negative) code point: plain char comparison
            for(int i = s; i < l; i++) {
                if( hay.charAt(i) == needle ) {
                    return i;
                }
            }
        } else if( Character.isValidCodePoint(needle) ) {
            // Supplementary code point: match its high/low surrogate pair
            final char[] chars = Character.toChars(needle);
            for(int i = s; i < l - 1; i++) {
                if( hay.charAt(i) == chars[0] && hay.charAt(i + 1) == chars[1] ) {
                    return i;
                }
            }
        }
        return -1;
    }

    /**
     * Remove all given separator unicode characters from given text,
     * i.e. all leading, all ending as well as duplicate consecutive separator within.
     * The latter reduces the separator to just a single one.
     * <p>
     * The text is traversed by unicode code point, hence supplementary characters are preserved.
     * A text consisting of separators only results in an empty string.
     * </p>
     * @param text the source text, {@code null} results in an empty string
     * @param separators separator unicode characters, pass {@code null} for {@link Character#isWhitespace(int) whitespace}.
     *                   Consider using {@link #WHITESPACE} to cover all unicode space character.
     * @param replacement optional replacement string for matched separator within sequence removing duplicated.
     *                    If {@code null}, the first found separator is used.
     * @return stripped text
     */
    public static String strip(final String text, final String separators, final String replacement) {
        if( null == text || text.isEmpty() ) {
            return "";
        }
        final int len = text.length();
        final StringBuilder sb = new StringBuilder(len);
        boolean seenContent = false; // leading separators are dropped until first content
        boolean pending = false;     // a separator run is open, emitted only if content follows
        int pendingSep = 0;          // first separator code point of the open run
        for(int i = 0; i < len; ) {
            final int cp = text.codePointAt(i);
            i += Character.charCount(cp);
            if( isSeparator(cp, separators) ) {
                if( seenContent && !pending ) {
                    pending = true;
                    pendingSep = cp;
                }
            } else {
                if( pending ) {
                    // Emit the collapsed run lazily, so a trailing run never needs to be removed again
                    if( null != replacement ) {
                        sb.append(replacement);
                    } else {
                        sb.appendCodePoint(pendingSep);
                    }
                    pending = false;
                }
                sb.appendCodePoint(cp);
                seenContent = true;
            }
        }
        return sb.toString();
    }

    /**
     * Returns a list of split {@code text} at {@code separators} or {@link Character#isWhitespace(int) whitespace}.
     * <p>
     * Each line's cutting point is the first {@code separator} or {@link Character#isWhitespace(int) whitespace}
     * occurrence starting at {@code text.length() / lineCount * 0.9}.
     * </p>
     * <p>
     * The separator or {@link Character#isWhitespace(int) whitespace} character
     * at the cutting point is skipped in the resulting list of the split parts, i.e. lines.
     * </p>
     * <p>
     * The result may contain less than {@code lineCount} lines if not enough cutting points are found.
     * </p>
     * @param text the text to be split, {@code null} results in an empty list
     * @param lineCount number of resulting lines, zero results in an empty list
     * @param separators separator unicode characters, pass {@code null} for {@link Character#isWhitespace(int) whitespace}.
     *                   Consider using {@link #WHITESPACE} to cover all unicode space character.
     * @return list of split lines, never {@code null}
     * @see #split(String, int, String, String)
     */
    public static List<String> split(final String text, final int lineCount, final String separators) {
        final List<String> list = new ArrayList<>();
        if( null == text || 0 == lineCount ) {
            return list;
        }
        final int len = text.length();
        if( 0 == len ) {
            return list;
        }
        if( 1 == lineCount ) {
            list.add(text);
            return list;
        }
        final int segLen = (int)Math.ceil((float)len / (float)lineCount * 0.9f);

        int i = segLen;
        int start = 0;
        while( i < len && list.size() < lineCount - 1 ) {
            if( 0 < i && Character.isLowSurrogate(text.charAt(i)) && Character.isHighSurrogate(text.charAt(i - 1)) ) {
                // Landed inside a surrogate pair: never test or cut in the middle of a code point
                i++;
                continue;
            }
            final int cp = text.codePointAt(i);
            if( isSeparator(cp, separators) ) {
                list.add(text.substring(start, i));
                start = i + Character.charCount(cp); // skip separator
                i += segLen;
            } else {
                i += Character.charCount(cp);
            }
        }
        if( start < len ) {
            list.add(text.substring(start, len));
        }
        return list;
    }

    /**
     * Returns a multi-line string of split {@code text} at {@code separators} or {@link Character#isWhitespace(int) whitespace}
     * glued with given {@code lineSeparator}.
     * <p>
     * Each line's cutting point is the first {@code separator} or {@link Character#isWhitespace(int) whitespace}
     * occurrence starting at {@code text.length() / lineCount * 0.9}.
     * </p>
     * <p>
     * The separator character or {@link Character#isWhitespace(int) whitespace}
     * at the cutting point is skipped in the string of glued split parts, i.e. lines.
     * </p>
     * @param text the text to be split, {@code null} results in an empty string
     * @param lineCount number of resulting lines
     * @param separators separator unicode characters, pass {@code null} for {@link Character#isWhitespace(int) whitespace}.
     *                   Consider using {@link #WHITESPACE} to cover all unicode space character.
     * @param lineSeparator the glue placed between the split lines in the concatenated result
     * @return the split lines concatenated using {@code lineSeparator}
     * @see #split(String, int, String)
     */
    public static String split(final String text, final int lineCount, final String separators, final String lineSeparator) {
        final List<String> lines = split(text, lineCount, separators);
        final StringBuilder sb = new StringBuilder();
        boolean addGlue = false;
        for(final String l : lines) {
            if( addGlue ) {
                sb.append(lineSeparator);
            }
            sb.append(l);
            addGlue = true;
        }
        return sb.toString();
    }

}