Java tutorial
/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * JFlex 1.6 Unicode plugin * * Copyright (c) 2008 Steve Rowe <steve_rowe@users.sf.net> * * * * All rights reserved. * * * * License: BSD * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ package jflex.maven.plugin.unicode; import java.io.File; import java.io.IOException; import java.io.PrintWriter; import java.net.URL; import java.util.ArrayList; import java.util.Arrays; import java.util.Collections; import java.util.EnumMap; import java.util.HashMap; import java.util.HashSet; import java.util.List; import java.util.Locale; import java.util.Map; import java.util.Objects; import java.util.Set; import java.util.SortedMap; import java.util.SortedSet; import java.util.TreeMap; import java.util.TreeSet; import java.util.regex.Matcher; import java.util.regex.Pattern; import org.apache.maven.plugin.logging.Log; /** * This class is instantiated for each version of Unicode for which data can be found at * unicode.org. This class is responsible for parsing the following Unicode data files: * * <ul> * <li>UnicodeData(-X.X.X).txt * <li>PropertyAliases(-X.X.X).txt * <li>PropertyValueAliases(-X.X.X).txt * <li>DerivedCoreProperties(-X.X.X).txt * <li>Scripts(-X.X.X).txt * <li>Blocks(-X.X.X).txt * <li>PropList(-X.X.X).txt * <li>ScriptExtensions(-X.X.X).txt * <li>LineBreak(-X.X.X).txt * <li>GraphemeBreakProperty(-X.X.X).txt * <li>SentenceBreakProperty(-X.X.X).txt * <li>WordBreakProperty(-X.X.X).txt * <li>DerivedAge(-X.X.X).txt * </ul> * * and for emitting (into the source file generated by jflex-unicode-maven-plugin: * UnicodeProperties.java) the set of Property Values supported by this version of Unicode, as well * as the corresponding code point interval sets for each member of the Property Value set. */ class UnicodeVersion { /** Pattern for the full Unicode version */ private static final Pattern FULL_VERSION_PATTERN = Pattern.compile("((\\d+)\\.(\\d+))\\.\\d+"); /** Pattern for the full Unicode version from the unicode data URL */ private static final Pattern FULL_VERSION_PATTERN_IN_URL = Pattern .compile("UnicodeData-((\\d+)\\.(\\d+)\\.\\d+)\\.txt$"); /** Pattern used to normalize property value identifiers */ private static final Pattern WORD_SEP_PATTERN = Pattern.compile("[-_\\s()]"); /** Normalized General_Category property name */ private static final String NORMALIZED_GENERAL_CATEGORY = normalize("General_Category"); /** Normalized Script property name */ private static final String NORMALIZED_SCRIPT = normalize("Script"); /** The number of code point ranges to output per line in UnicodeProperties.java. */ private static final int INTERVALS_PER_LINE = 4; /** * The number of alias/target property value pairs to output per line in UnicodeProperties.java. */ private static final int PROPERTY_VALUE_ALIAS_MAPPINGS_PER_LINE = 2; /** The number of property values to output per line in UnicodeProperties.java. */ private static final int PROPERTY_VALUES_PER_LINE = 6; /** The number of caseless match partitions to output per line in UnicodeProperties.java. */ private static final int CASELESS_MATCH_PARTITIONS_PER_LINE = 3; /** The property values that represent surrogates [U+D800-U+DFFF] */ private static final Pattern SURROGATE_PATTERN = Pattern.compile("^cs$|surrogate", Pattern.CASE_INSENSITIVE); /** Unicode version X.X.X */ String majorMinorUpdateVersion; /** Unicode version X.X */ String majorMinorVersion; /** Unicode major version */ int majorVersion; /** Unicode minor version */ int minorVersion; /** The greatest code point listed in UnicodeData(-X.X.X).txt */ int maximumCodePoint; /** Maps Unicode property values to the associated set of code point ranges. */ SortedMap<String, NamedRangeSet> propertyValueIntervals = new TreeMap<>(); /** Stores encountered enumerated property names and values */ Map<String, Set<String>> usedEnumeratedProperties = new HashMap<>(); /** Stores encountered binary property names */ Set<String> usedBinaryProperties = new HashSet<>(); /** Stores all defined property name aliases */ Map<String, Set<String>> allPropertyAliases = new HashMap<>(); /** Stores all defined property value aliases */ Map<String, Map<String, Set<String>>> allPropertyValueAliases = new HashMap<>(); /** Maps property aliases to their corresponding canonical property names */ Map<String, String> propertyAlias2CanonicalName = new HashMap<>(); /** Maps property value aliases to their corresponding canonical property values */ Map<String, Map<String, String>> propertyValueAlias2CanonicalValue = new HashMap<>(); /** * A set of code point space partitions, each containing at least two caselessly equivalent code * points. */ Map<Integer, SortedSet<Integer>> caselessMatchPartitions = new HashMap<>(); /** The maximum size of the partitions in {@link #caselessMatchPartitions}. */ int caselessMatchPartitionSize = 0; private EnumMap<DataFileType, URL> dataFiles; /** * Instantiates a container for versioned Unicode data. * * @param version The Unicode version, either in form "X.X.X" or "X.X". * @param dataFiles Set of unicode data file types and corresponding URLs to be fetched and * parsed. */ UnicodeVersion(String version, EnumMap<DataFileType, URL> dataFiles) { this.dataFiles = dataFiles; setVersions(version, dataFiles.get(DataFileType.UNICODE_DATA)); } /** * Fetches and parses the data files defined for this Unicode version. * * @param log Where to put info about which files have been fetched and parsed * @throws IOException If there is a problem fetching or parsing any of this version's data files. */ public void fetchAndParseDataFiles(Log log) throws IOException { // Use the enum ordering to process in the correct order for (EnumMap.Entry<DataFileType, URL> entry : dataFiles.entrySet()) { DataFileType fileType = entry.getKey(); URL url = entry.getValue(); log.info("\t\tFetching/parsing: " + url.getPath()); fileType.scan(url, this); log.info("\t\tCompleted: " + url.getPath()); } } /** * Fills in majorMinorVersion and majorMinorUpdateVersion based on the passed in version string. * Also fills in isArchaicPropListFormat, based on the majorMinorVersion. * * @param version The Unicode version, in form "X.X.X" or "X.X". * @param unicodeDataURL The URL at which UnicodeData(-X.X.X).txt is located. */ private void setVersions(String version, URL unicodeDataURL) { Matcher matcher = FULL_VERSION_PATTERN.matcher(version); if (matcher.matches()) { majorMinorUpdateVersion = matcher.group(0); majorMinorVersion = matcher.group(1); majorVersion = Integer.parseInt(matcher.group(2)); minorVersion = Integer.parseInt(matcher.group(3)); } else { majorMinorVersion = version; matcher = FULL_VERSION_PATTERN_IN_URL.matcher(unicodeDataURL.toString()); if (matcher.find()) { majorMinorUpdateVersion = matcher.group(1); majorVersion = Integer.parseInt(matcher.group(2)); minorVersion = Integer.parseInt(matcher.group(3)); } } } public void emitToDir(File outputDir) throws IOException { String generatedClassName = getGeneratedClassName(); PrintWriter writer = new PrintWriter(new File(outputDir, generatedClassName + ".java"), "UTF-8"); writer.append("package jflex.unicode.data;\n\n"); writer.append("public class ").append(generatedClassName).append(" {\n"); emitConstructor(writer); emitMaximumCodePoint(writer); emitPropertyValuesArray(writer); emitIntervalsArray(writer); emitPropertyValueAliasesArray(writer); emitCaselessMatchPartitions(writer); writer.append("}\n"); writer.close(); } private void emitConstructor(PrintWriter writer) { } /** * Grows the partition containing the given codePoint and its caseless equivalents, if any, to * include all of them. * * @param codePoint The code point to include in a caselessly equivalent partition * @param uppercaseMapping A hex String representation of the uppercase mapping of codePoint, or * null if there isn't one * @param lowercaseMapping A hex String representation of the lowercase mapping of codePoint, or * null if there isn't one * @param titlecaseMapping A hex String representation of the titlecase mapping of codePoint, or * null if there isn't one */ void addCaselessMatches(int codePoint, String uppercaseMapping, String lowercaseMapping, String titlecaseMapping) { if ((null != uppercaseMapping && uppercaseMapping.length() > 0) || (null != lowercaseMapping && lowercaseMapping.length() > 0) || (null != titlecaseMapping && titlecaseMapping.length() > 0)) { Integer upper = null; if (null != uppercaseMapping && uppercaseMapping.length() > 0) upper = Integer.valueOf(uppercaseMapping, 16); else uppercaseMapping = null; Integer lower = null; if (null != lowercaseMapping && lowercaseMapping.length() > 0) lower = Integer.valueOf(lowercaseMapping, 16); else lowercaseMapping = null; Integer title = null; if (null != titlecaseMapping && titlecaseMapping.length() > 0) title = Integer.valueOf(titlecaseMapping, 16); else titlecaseMapping = null; SortedSet<Integer> partition = caselessMatchPartitions.get(codePoint); if (null == partition) { if (null != uppercaseMapping) partition = caselessMatchPartitions.get(upper); if (null == partition && null != lowercaseMapping) partition = caselessMatchPartitions.get(lower); if (null == partition && null != titlecaseMapping) partition = caselessMatchPartitions.get(title); if (null == partition) { partition = new TreeSet<>(); } } partition.add(codePoint); caselessMatchPartitions.put(codePoint, partition); if (null != uppercaseMapping) { partition.add(upper); caselessMatchPartitions.put(upper, partition); } if (null != lowercaseMapping) { partition.add(lower); caselessMatchPartitions.put(lower, partition); } if (null != titlecaseMapping) { partition.add(title); caselessMatchPartitions.put(title, partition); } if (partition.size() > caselessMatchPartitionSize) { caselessMatchPartitionSize = partition.size(); } } } /** * Given a binary property name, and starting and ending code points, adds the interval to the * {@link #propertyValueIntervals} map. * * @param propName The property name, e.g. "Assigned". * @param startCodePoint The first code point in the interval. * @param endCodePoint The last code point in the interval. */ void addInterval(String propName, int startCodePoint, int endCodePoint) { propName = getCanonicalPropertyName(normalize(propName)); if (!SURROGATE_PATTERN.matcher(propName).find()) { List<NamedRange> ranges = removeSurrogates(startCodePoint, endCodePoint); if (!ranges.isEmpty()) { NamedRangeSet intervals = propertyValueIntervals.get(propName); if (null == intervals) { intervals = new NamedRangeSet(); propertyValueIntervals.put(propName, intervals); } for (NamedRange range : ranges) { // UnicodeData-1.1.5.txt does not list the end point for the Unified Han // range (starting point is listed as U+4E00). This is U+9FFF according // to <http://unicode.org/Public/TEXT/OLDAPIX/CHANGES.TXT>: // // U+4E00 ^ U+9FFF 20,992 I-ZONE Ideographs // // U+4E00 is listed in UnicodeData-1.1.5.txt as having the "Lo" property // value, as are the previous code points, so to include // [ U+4E00 - U+9FFF ], this interval should be extended to U+9FFF. if (range.end == 0x4E00 && Objects.equals(majorMinorVersion, "1.1")) { range.end = 0x9FFF; } intervals.add(new NamedRangeSet(range)); } usedBinaryProperties.add(propName); } } } /** * Returns 0, 1, or 2 ranges for the given interval, depending on whether it is contained within; * is entirely outside of or starts or ends within; or straddles the surrogate range * 0xD800-0xDFFF, respectively. */ List<NamedRange> removeSurrogates(int startCodePoint, int endCodePoint) { assert startCodePoint <= endCodePoint; if (startCodePoint >= 0xD800 && endCodePoint <= 0xDFFF) { return Collections.emptyList(); } List<NamedRange> ranges = new ArrayList<>(); if (endCodePoint < 0xD800 || startCodePoint > 0xDFFF) { ranges.add(new NamedRange(startCodePoint, endCodePoint)); return ranges; } if (startCodePoint < 0xD800) { ranges.add(new NamedRange(startCodePoint, 0xD7FF)); } if (endCodePoint > 0xDFFF) { ranges.add(new NamedRange(0xE000, endCodePoint)); } return ranges; } /** * Given an enumerated property name and value, and starting and ending code points, adds the * interval to the {@link #propertyValueIntervals} map. * * @param propName The property name, e.g. "General_Category". * @param propValue The property value, e.g. "Lu" * @param startCodePoint The first code point in the interval. * @param endCodePoint The last code point in the interval. */ void addInterval(String propName, String propValue, int startCodePoint, int endCodePoint) { propName = getCanonicalPropertyName(normalize(propName)); propValue = getCanonicalPropertyValue(propName, normalize(propValue)); // Skip surrogate properties [U+D800-U+DFFF], e.g. \p{Cs} - can't be // represented in valid UTF-16 encoded strings if (!SURROGATE_PATTERN.matcher(propValue).find()) { List<NamedRange> ranges = removeSurrogates(startCodePoint, endCodePoint); if (!ranges.isEmpty()) { String canonicalValue = propName + '=' + propValue; if (Objects.equals(propName, NORMALIZED_GENERAL_CATEGORY) || Objects.equals(propName, NORMALIZED_SCRIPT)) { canonicalValue = propValue; } NamedRangeSet intervals = propertyValueIntervals.get(canonicalValue); if (null == intervals) { intervals = new NamedRangeSet(); propertyValueIntervals.put(canonicalValue, intervals); } for (NamedRange range : ranges) { // Unicode 2.0 has an error in Blocks-1.txt: two ranges overlap. // Since the single char in the second range (U+FEFF) is not an // Arabic character, but rather the zero-width no-break space char, // the FE70..FEFF block should be shortened to exclude this char; // this error is corrected in all following Unicode versions of // Blocks(-X|-X.X.X.).txt. // // FE70; FEFF; Arabic Presentation Forms-B // ... // FEFF; FEFF; Specials if (range.start == 0xFE70 && range.end == 0xFEFF && Objects.equals(majorMinorVersion, "2.0")) { range.end = 0xFEFE; } intervals.add(new NamedRangeSet(range)); } Set<String> usedValues = usedEnumeratedProperties.get(propName); if (null == usedValues) { usedValues = new HashSet<>(); usedEnumeratedProperties.put(propName, usedValues); } usedValues.add(propValue); // Initial letters of two-letter General Category property values // should be put on the used property values list if (Objects.equals(propName, NORMALIZED_GENERAL_CATEGORY) && propValue.length() == 2) { String firstLetter = propValue.substring(0, 1); usedValues.add(firstLetter); } } } } /** * Emits an int declaration and definition for the maximum code point listed for this version in * UnicodeData(-X.X.X).txt. * * @param writer Where to emit the maximum code point. */ void emitMaximumCodePoint(PrintWriter writer) { writer.append(" public static final int maximumCodePoint").append(" = 0x") .append(Integer.toString(maximumCodePoint, 16)).append(";\n"); } /** * Emits an array declaration and definition for the set of property values supported by this * version of Unicode. * * @param writer Where to emit the property values array. */ void emitPropertyValuesArray(PrintWriter writer) { writer.append(" public static final String[] propertyValues").append("\n = { "); int item = 0; for (String propValue : propertyValueIntervals.keySet()) { if (++item == PROPERTY_VALUES_PER_LINE) { writer.append(",\n "); item = 1; } else if (item > 1) { writer.append(", "); } writer.append("\"").append(propValue).append("\""); } writer.append(" };\n"); } /** * Emits an array declaration and definition for the set of code point ranges in this version of * Unicode, corresponding to and in the same order as the array of property values emitted in * {@link #emitPropertyValuesArray(java.io.PrintWriter)}. * * <p>Note that String form is required for the amount of data associated with the existing * Unicode versions - when coded as static two-dimensional arrays of int, the Java byte compiler * complains that "code too large". This is apparently due to size limits in the specification for * Java .class format. * * @param writer Where to emit the intervals array */ void emitIntervalsArray(PrintWriter writer) { writer.append(" public static final String[] intervals = {\n"); boolean isFirst = true; for (SortedMap.Entry<String, NamedRangeSet> entry : propertyValueIntervals.entrySet()) { String propertyValue = entry.getKey(); NamedRangeSet intervals = entry.getValue(); if (isFirst) { isFirst = false; } else { writer.append(",\n"); } writer.append(" // Unicode ").append(majorMinorVersion).append(" property value: {") .append(propertyValue).append("}\n"); int count = 0; boolean isFirstIntervalLine = true; for (NamedRange interval : intervals.getRanges()) { if (interval.start <= getMaximumCodePoint()) { if (++count > INTERVALS_PER_LINE) { writer.append("\n"); count = 1; } if (count == 1) { writer.append(isFirstIntervalLine ? " \"" : " + \""); } else { writer.append("+\""); } isFirstIntervalLine = false; emitEscapedUTF16Char(writer, interval.start); emitEscapedUTF16Char(writer, Math.min(interval.end, getMaximumCodePoint())); writer.append("\""); } } } writer.append(" };\n"); } /** * Populates a map of all possible aliases for the encountered properties and their values. ` * * @return a sorted map of all possible aliases for used properties & values */ SortedMap<String, String> getUsedPropertyValueAliases() { SortedMap<String, String> usedPropertyValueAliases = new TreeMap<>(); for (String binaryProperty : usedBinaryProperties) { for (String nameAlias : getPropertyAliases(binaryProperty)) { if (!Objects.equals(nameAlias, binaryProperty)) { usedPropertyValueAliases.put(nameAlias, binaryProperty); } } } Set<String> genCatProps = usedEnumeratedProperties.get(NORMALIZED_GENERAL_CATEGORY); if (null != genCatProps) { genCatProps.add("lc"); } for (Map.Entry<String, Set<String>> entry : usedEnumeratedProperties.entrySet()) { String propName = entry.getKey(); Set<String> propValues = entry.getValue(); for (String propValue : propValues) { String canonicalValue = propName + '=' + propValue; // Add value-only aliases for General Category and Script properties. if (Objects.equals(propName, NORMALIZED_SCRIPT) || Objects.equals(propName, NORMALIZED_GENERAL_CATEGORY)) { canonicalValue = propValue; for (String valueAlias : getPropertyValueAliases(propName, propValue)) { if (!Objects.equals(valueAlias, propValue)) { usedPropertyValueAliases.put(valueAlias, propValue); } } } for (String nameAlias : getPropertyAliases(propName)) { for (String valueAlias : getPropertyValueAliases(propName, propValue)) { // Both property names and values have self-aliases; when generating // all possible alias combinations, exclude the one that is the same // as the full property name + full property value, unless the // property is General Category or Script. if (Objects.equals(propName, NORMALIZED_SCRIPT) || Objects.equals(propName, NORMALIZED_GENERAL_CATEGORY) || !(Objects.equals(nameAlias, propName) && Objects.equals(valueAlias, propValue))) { String alias = nameAlias + '=' + valueAlias; usedPropertyValueAliases.put(alias, canonicalValue); } } } } } return usedPropertyValueAliases; } /** * Emits an array declaration and definition of alternating key/value mappings from property value * aliases to target property values. * * @param writer Where to emit the intervals array */ void emitPropertyValueAliasesArray(PrintWriter writer) { writer.append(" public static final String[] propertyValueAliases").append(" = {\n "); int count = 0; boolean isFirstLine = true; for (SortedMap.Entry<String, String> entry : getUsedPropertyValueAliases().entrySet()) { if (++count > PROPERTY_VALUE_ALIAS_MAPPINGS_PER_LINE) { writer.append(",\n "); count = 1; } else if (!isFirstLine) { writer.append(", "); } else { isFirstLine = false; } String alias = entry.getKey(); String propertyValue = entry.getValue(); writer.append("\"").append(alias).append("\", \"").append(propertyValue).append("\""); } writer.append("\n };\n"); } /** * Emits a constant assigned the size of each partition record (the maximum partition size) and a * string representing a sequence of fixed-length partition records. For partitions smaller than * the maximum size, the unused fields are populated with \u0000. * * @param writer Where to emit the caseless match partitions */ void emitCaselessMatchPartitions(PrintWriter writer) { writer.append(" public static final int caselessMatchPartitionSize").append(" = ") .append(Integer.toString(caselessMatchPartitionSize)).append(";\n"); writer.append(" public static final String caselessMatchPartitions").append(" =\n"); // Putting all of the partitions into a set ensures there are no duplicates SortedMap<Integer, SortedSet<Integer>> partitions = new TreeMap<>(); for (SortedSet<Integer> partition : caselessMatchPartitions.values()) { partitions.put(partition.first(), partition); } int count = 0; boolean isFirstPartitionLine = true; for (SortedSet<Integer> partition : partitions.values()) { if (++count > CASELESS_MATCH_PARTITIONS_PER_LINE) { writer.append("\n"); count = 1; } if (count == 1) writer.append(isFirstPartitionLine ? " \"" : " + \""); else writer.append("+\""); isFirstPartitionLine = false; for (Integer c : partition) emitEscapedUTF16Char(writer, c); // Add \u0000 placeholders to fill out the fixed record size for (int i = 0; i < caselessMatchPartitionSize - partition.size(); ++i) emitEscapedUTF16Char(writer, 0); writer.append("\""); } writer.append(";\n"); } /** * Emits an escaped character: * * <ul> * <li>in form "\\uXXXX", where XXXX is the hexadecimal form of the code point, if the given * point is in the Basic Multilingual Plane (BMP); or * <li>in form "\\uXXXX\\uYYYY", where XXXX and YYYY are the high and low surrogates, * respectively, representing the given point in UTF-16 form, if the given code point is * above the BMP. * </ul> * * @param writer Where to emit the escaped character. * @param codePoint The code point for which to emit an escaped character. */ private void emitEscapedUTF16Char(PrintWriter writer, int codePoint) { if (codePoint <= 0xFFFF) { emitEscapedBMPChar(writer, codePoint); } else { // codePoint > 0xFFFF - above the BMP if (codePoint <= 0x10FFFF) { for (char surrogate : Character.toChars(codePoint)) emitEscapedBMPChar(writer, (int) surrogate); } else { writer.append("<").append(Integer.toHexString(codePoint)).append(">"); } } } /** * Emits an escaped character in the form "\\uXXXX", where XXXX is the hexadecimal form of the * given code point, which must be in the Basic Multilingual Plane (BMP). Called from {@link * #emitEscapedUTF16Char(PrintWriter,int)} * * @param writer Where to emit the escaped character. * @param codePoint The code point for which to emit an escaped character. */ private void emitEscapedBMPChar(PrintWriter writer, int codePoint) { switch (codePoint) { // Special treatment for the quotation mark (U+0022). "\u0022" triggers // a syntax error when it is included in a literal string, because it is // interpreted as "[...]"[...]" (literally), and leads the compiler to // think that the enclosing quotation marks are unbalanced. case 0x22: writer.append("\\\""); break; case 0x0: writer.append("\\000"); break; case 0x9: writer.append("\\t"); break; case 0xA: writer.append("\\n"); break; case 0xC: writer.append("\\f"); break; case 0xD: writer.append("\\r"); break; case 0x5C: writer.append("\\\\"); break; default: writer.append(String.format("\\u%04x", codePoint)); } } /** * Returns an class name for the unicode version, suffixed the Unicode major.minor version * * @return "Unicode_X_Y", where X = major version, and Y = minor version. */ String getGeneratedClassName() { return String.format("Unicode_%s", majorMinorVersion.replace(".", "_")); } /** * Transforms mixed case identifiers containing spaces, hyphens, and/or underscores by downcasing * and removing all spaces, hyphens, underscores, and parentheses; also, converts property * name/value separator ':' to '='. * * @param identifier The identifier to transform * @return The transformed identifier */ static String normalize(String identifier) { if (null == identifier) return identifier; String normalized = WORD_SEP_PATTERN.matcher(identifier.toLowerCase(Locale.ENGLISH)).replaceAll(""); return normalized.replace(':', '='); } /** * Called from {@link #getUsedPropertyValueAliases()} to get a set of aliases for the given * property name. If none exists, an empty set is returned. * * @param propertyName The property name for which to lookup aliases. * @return the aliases for the given property name; if none exists, a set containing the given * property name is returned. */ Set<String> getPropertyAliases(String propertyName) { Set<String> aliases = allPropertyAliases.get(propertyName); if (null == aliases) { aliases = new HashSet<>(Arrays.asList(propertyName)); } return aliases; } /** * Called from {@link #getUsedPropertyValueAliases()} to get a set of aliases for the given * property name and value. If none exists, an empty set is returned. * * @param propertyName The property name to use when looking up aliases for the given property * value * @param propertyValue The property value for which to lookup aliases. * @return the aliases for the given property name and value; if none exists, an empty set is * returned. */ Set<String> getPropertyValueAliases(String propertyName, String propertyValue) { Set<String> aliases = null; Map<String, Set<String>> values = allPropertyValueAliases.get(propertyName); if (null != values) { aliases = values.get(propertyValue); } if (null == aliases) { aliases = Collections.emptySet(); } return aliases; } /** * For the given property name or alias, returns the canonical property name. If none has been * encountered, then the given propertyAlias is returned. * * @param propertyAlias The property name or alias for which to lookup the canonical property * name. * @return the canonical property name for the given property name or alias. If none has been * encountered, then the given propertyAlias is returned. */ String getCanonicalPropertyName(String propertyAlias) { String canonicalName = null; propertyAlias = normalize(propertyAlias); if (null != propertyAlias2CanonicalName) { canonicalName = propertyAlias2CanonicalName.get(propertyAlias); } return null == canonicalName ? propertyAlias : canonicalName; } /** * For the given property name and property value (or property value alias), returns the canonical * property value. If none has been encountered, then the given propertyValueAlias is returned. * * @param propertyName The property name to use when looking up a property value * @param propertyValueAlias The property value (alias) for which to look up the canonical * property value. * @return the canonical property value for the given property name and property value (alias); if * none is found, the given propertyValueAlias is returned. */ String getCanonicalPropertyValue(String propertyName, String propertyValueAlias) { String canonicalValue = null; if (null != propertyValueAlias2CanonicalValue) { Map<String, String> valueAliases = propertyValueAlias2CanonicalValue.get(propertyName); if (null != valueAliases) { canonicalValue = valueAliases.get(propertyValueAlias); } } return null == canonicalValue ? propertyValueAlias : canonicalValue; } /** * Called from PropertyValueAliasesScanner to populate the property values and property value * aliases for a property. * * @param propertyAlias The alias for a property name (or the property itself) * @param propertyValue The property value for which to set the aliases * @param propertyValueAliases The aliases to set for the given propertyValue */ void addPropertyValueAliases(String propertyAlias, String propertyValue, Set<String> propertyValueAliases) { String propertyName = getCanonicalPropertyName(propertyAlias); propertyValue = normalize(propertyValue); propertyValueAliases.add(propertyValue); Map<String, Set<String>> propertyValue2Aliases = allPropertyValueAliases.get(propertyName); if (null == propertyValue2Aliases) { propertyValue2Aliases = new HashMap<>(); allPropertyValueAliases.put(propertyName, propertyValue2Aliases); } propertyValue2Aliases.put(propertyValue, propertyValueAliases); Map<String, String> aliasMap = propertyValueAlias2CanonicalValue.get(propertyName); if (null == aliasMap) { aliasMap = new HashMap<>(); propertyValueAlias2CanonicalValue.put(propertyName, aliasMap); } for (String propertyValueAlias : propertyValueAliases) { propertyValueAlias = normalize(propertyValueAlias); aliasMap.put(propertyValueAlias, propertyValue); } } /** * Sets the maximum code point for this Unicode version. * * @param maximumCodePoint The new maximum code point for this Unicode version */ void setMaximumCodePoint(int maximumCodePoint) { this.maximumCodePoint = maximumCodePoint; } /** * Returns the maximum code point for this Unicode version. * * @return the maximum code point for this Unicode version. */ public int getMaximumCodePoint() { return maximumCodePoint; } public void addCompatibilityProperties() { NamedRangeSet whitespaceRanges = propertyValueIntervals.get("whitespace"); if (null == whitespaceRanges) { // For Unicode 1.1, subsitute "Space_separator" (Zs) for "Whitespace" whitespaceRanges = propertyValueIntervals.get("zs"); } // UTR#18: \p{xdigit} = [\p{gc=Decimal_Number}\p{Hex_Digit}] // \p{gc=Decimal_Number} = \p{Nd} (available in all versions) NamedRangeSet xdigitSet = propertyValueIntervals.get("nd").copy(); NamedRangeSet hexDigitRanges = propertyValueIntervals.get("hexdigit"); if (null == hexDigitRanges) { // Hex_Digit was introduced in Unicode 2.0; handle for Unicode 1.1 // Hex_Digit contains 0-9 A-F, fullwidth and halfwidth, upper and lowercase. // \p{Nd} contains all required digit forms, so no need to add them here // Unicode 1.1 doesn't define HALFWIDTH latin letters (or digits) hexDigitRanges = new NamedRangeSet(); hexDigitRanges.add(new NamedRangeSet(new NamedRange((int) 'A', (int) 'F'))); hexDigitRanges.add(new NamedRangeSet(new NamedRange((int) 'a', (int) 'f'))); // FF21..FF26;FULLWIDTH LATIN CAPITAL LETTER A..F hexDigitRanges.add(new NamedRangeSet(new NamedRange(0xFF21, 0xFF26))); // FF41..FF46;FULLWIDTH LATIN SMALL LETTER A..F hexDigitRanges.add(new NamedRangeSet(new NamedRange(0xFF41, 0xFF46))); } xdigitSet.add(hexDigitRanges); propertyValueIntervals.put("xdigit", xdigitSet); usedBinaryProperties.add("xdigit"); // UTR#18: \p{alnum} = [\p{alpha}\p{digit}] // \p{alpha} = \p{Alphabetic} (available in all versions except 1.1) NamedRangeSet alnumSet; NamedRangeSet alphaRanges = propertyValueIntervals.get("alphabetic"); if (null == alphaRanges) { // For Unicode 1.1, substitute "Letter" (L) for "Alphabetic". // \p{L} = [\p{Ll}\p{Lu}\p{Lt}\p{Lm}\p{Lo}] alnumSet = propertyValueIntervals.get("ll").copy(); alnumSet.add(propertyValueIntervals.get("lm")); alnumSet.add(propertyValueIntervals.get("lo")); alnumSet.add(propertyValueIntervals.get("lu")); // Unicode 1.1 has no characters with the 'Lt' (titlecase letter) property. } else { alnumSet = alphaRanges.copy(); } // \p{digit} = \p{gc=Decimal_Digit} = \p{Nd} (available in all versions) alnumSet.add(propertyValueIntervals.get("nd")); propertyValueIntervals.put("alnum", alnumSet); // UTR#18: \p{blank} = [\p{Whitespace} // -- [\N{LF} \N{VT} \N{FF} \N{CR} \N{NEL} // \p{gc=Line_Separator} \p{gc=Paragraph_Separator}]] NamedRangeSet blankSet = whitespaceRanges.copy(); // Subtract: [\N{LF}\N{VT}\N{FF}\N{CR}] = [U+000A-U+000D] blankSet.sub(new NamedRangeSet(new NamedRange(0xA, 0xD))); // Subtract: \N{NEL} blankSet.sub(new NamedRangeSet(new NamedRange(0x85, 0x85))); blankSet.sub(propertyValueIntervals.get("zl")); // \p{gc=Line_Separator} blankSet.sub(propertyValueIntervals.get("zp")); // \p{gc=Paragraph_Separator} propertyValueIntervals.put("blank", blankSet); usedBinaryProperties.add("blank"); // UTR#18: \p{graph} = [^\p{space}\p{gc=Control}\p{gc=Surrogate}\p{gc=Unassigned}] NamedRangeSet graphSet = new NamedRangeSet(new NamedRange(0x0, getMaximumCodePoint())); graphSet.sub(whitespaceRanges); graphSet.sub(propertyValueIntervals.get("cc")); // \p{gc=Control} graphSet.sub(propertyValueIntervals.get("cn")); // \p{gc=Unassigned} graphSet.sub(new NamedRangeSet(new NamedRange(0xD800, 0xDFFF))); propertyValueIntervals.put("graph", graphSet); usedBinaryProperties.add("graph"); // UTR#18: \p{print} = [\p{graph}\p{blank} -- \p{cntrl}] // \p{cntrl} = \p{gc=Control} = \p{gc=Cc} = \p{Cc} NamedRangeSet printSet = graphSet.copy(); printSet.add(blankSet); printSet.sub(propertyValueIntervals.get("cc")); propertyValueIntervals.put("print", printSet); usedBinaryProperties.add("print"); } }