Java tutorial
/* * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, * software distributed under the License is distributed on an * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY * KIND, either express or implied. See the License for the * specific language governing permissions and limitations * under the License. */ package org.apache.uima.lucas.indexer.analysis; import java.io.IOException; import java.text.Format; import java.util.Collection; import java.util.Collections; import java.util.Iterator; import java.util.LinkedList; import java.util.List; import java.util.Map; import org.apache.log4j.Logger; import org.apache.lucene.analysis.Token; import org.apache.lucene.analysis.TokenStream; import org.apache.uima.cas.CAS; import org.apache.uima.cas.CASException; import org.apache.uima.cas.Feature; import org.apache.uima.cas.FeatureStructure; import org.apache.uima.cas.Type; import org.apache.uima.jcas.JCas; import org.apache.uima.jcas.cas.FSArray; import org.apache.uima.jcas.cas.StringArray; import org.apache.uima.jcas.tcas.Annotation; import com.google.common.base.Predicate; import com.google.common.collect.ImmutableBiMap; import com.google.common.collect.Iterators; import com.google.common.collect.Lists; /** * * AnnotationTokenStream represents a TokenStream which extracts tokens from feature values of * annotations of a given type from a JCas object. Each token has the start and end offset from the * annotation object. This class supports only the following UIMA JCas types of features: * <ol> * <li>String</li> * <li>StringArray</li> * <li>FSArray</li> * <li>Number types</li> * </ol> */ public class AnnotationTokenStream extends TokenStream { private JCas jCas; private String featurePath; private List<String> featureNames; private String delimiter; private Iterator<Annotation> annotationIterator; // iterates over annotations private Iterator<FeatureStructure> featureStructureIterator; // iterates over feature structures // stored in feature arrays of an // annotation private Iterator<String> featureValueIterator; // iterates over the features of a feature // structure private Annotation currentAnnotation; private Type annotationType; private Map<String, Format> featureFormats; // a optional map of format object for each feature private static Logger logger = Logger.getLogger(AnnotationTokenStream.class); private class NotNullPredicate<T> implements Predicate<T> { public boolean apply(T object) { return object != null; } } /** * Creates a TokenStream which extracts all coveredText feature values of annotations of a given * type from a JCas object. Each token has the start and end offset of the annotation and takes * the covered text value as termText. * * @param jCas * the jCas * @param sofaName the name of the subject of analysis (sofa) * @param typeName * the type of the annotation * @throws CASException */ public AnnotationTokenStream(JCas jCas, String sofaName, String typeName) throws InvalidTokenSourceException { this(jCas, sofaName, typeName, null, Collections.<String>emptyList(), null, Collections.<String, Format>emptyMap()); } /** * Creates a TokenStream which extracts all feature values of a given feature name from * annotations with a given type from a given JCas object. Each token has the start and end offset * of the annotation and uses the feature value as term text. * * @param jCas * the JCas object * @param sofaName the name of the subject of analysis (sofa) * @param typeName * the type of the annotation * @param featureName * the name of the feature from which the token text is build * @param featureFormat * optional format object to convert feature values to strings * @throws InvalidTokenSourceException */ public AnnotationTokenStream(JCas jCas, String sofaName, String typeName, String featureName, Format featureFormat) throws InvalidTokenSourceException { this(jCas, sofaName, typeName, null, Lists.newArrayList(featureName), null, featureFormat != null ? ImmutableBiMap.of(featureName, featureFormat) : Collections.<String, Format>emptyMap()); } /** * Creates a TokenStream which extracts all feature values of a given feature name list from * annotations with a given type from a given JCas object. Each token has the start and end offset * of the annotation and uses the concatenation of all the feature values as term text. Optionally * the different feature values of an annotation can be concatenated with a delimiter. * * @param jCas * the JCas object * @param sofaName the name of the Subject Of Analysis (sofa) * @param typeName * the type of the annotation * @param featureNames * the name of the feature from which the token text is build * @param delimiter * a delimiter for concatenating the different feature values of an annotation object. If * null a white space will be used. * @param featureFormats * optional map of format objects to convert feature values to strings - the key must be * the feature name * @throws InvalidTokenSourceException */ public AnnotationTokenStream(JCas jCas, String sofaName, String typeName, List<String> featureNames, String delimiter, Map<String, Format> featureFormats) throws InvalidTokenSourceException { this(jCas, sofaName, typeName, null, featureNames, delimiter, featureFormats); } /** * Creates a TokenStream which extracts all feature values of a given feature name list from * annotations with a given type from a given JCas object. Each token has the start and end offset * of the annotation and uses the concatenation of all the feature values as term text. * * @param jCas * the JCas object * @param sofaName the name of the Subject Of Analysis (sofa) * @param typeName * the type of the annotation * @param featureNames * the name of the feature from which the token text is build * @param featureFormats * optional map of format objects to convert feature values to strings - the key must be * the feature name * @throws InvalidTokenSourceException */ public AnnotationTokenStream(JCas jCas, String sofaName, String typeName, List<String> featureNames, Map<String, Format> featureFormats) throws InvalidTokenSourceException { this(jCas, sofaName, typeName, null, featureNames, null, featureFormats); } /** * Creates a TokenStream which extracts all feature values of a given feature name list from * annotations with a given type from a given JCas object. The addressed features are part of * direct or indirect feature structure value of a annotation. For example a annotation of type * person has a feature address which values are address feature structures with features for the * street, postal code and city . To create tokens with postal code and city of a persons address, * the featurePath must be "address" and the featureNames "postalCode" and * "city". Each token has the start and end offset of the annotation and uses the * concatenation of all the feature values as term text. * * @param jCas * the JCas object * @param sofaName the name of the Subject of Analysis (sofa) * @param typeName * the type of the annotation * @param featurePath * the path to the feature structures which features should be used for tokens Path * entries should be separated by ".". Example: * "affiliation.address.country" * @param featureNames * the name of the feature from which the token text is build * @param featureFormats * optional map of format objects to convert feature values to strings - the key must be * the feature name * @throws InvalidTokenSourceException */ public AnnotationTokenStream(JCas jCas, String sofaName, String typeName, String featurePath, List<String> featureNames, Map<String, Format> featureFormats) throws InvalidTokenSourceException { this(jCas, sofaName, typeName, featurePath, featureNames, null, featureFormats); } /** * Creates a TokenStream which extracts all feature values of a given feature name list from * annotations with a given type from a given JCas object. The addressed features are part of * direct or indirect feature structure value of a annotation. For example a annotation of type * person has a feature address which values are address feature structures with features for the * street, postal code and city . To create tokens with postal code and city of a persons address, * the featurePath must be "address" and the featureNames "postalCode" and * "city". Each token has the start and end offset of the annotation and uses the * concatenation of all the feature values as term text. Optionally the different feature values * of an annotation can be concatenated with a delimiter. * * @param jCas * the JCas object * @param sofaName the name of the Subject of Analysis (sofa) * @param typeName * the type of the annotation * @param featurePath * the path to the feature structures which features should be used for tokens Path * entries should be separated by ".". Example: * "affiliation.address.country" * @param featureNames * the name of the feature from which the token text is build * @param delimiter * a delimiter for concatenating the different feature values of an annotation object. If * null a white space will be used. * @param featureFormats * optional map of format objects to convert feature values to strings - the key must be * the feature name * @throws InvalidTokenSourceException */ public AnnotationTokenStream(JCas jCas, String sofaName, String typeName, String featurePath, List<String> featureNames, String delimiter, Map<String, Format> featureFormats) throws InvalidTokenSourceException { super(); this.featurePath = featurePath; this.featureNames = featureNames; this.delimiter = delimiter; if (featureFormats == null) this.featureFormats = Collections.emptyMap(); else this.featureFormats = featureFormats; getSofaCas(jCas, sofaName); getTypeForName(typeName); validate(annotationType, featureNames, featurePath); initializeIterators(); } private void getTypeForName(String typeName) throws InvalidTokenSourceException { annotationType = jCas.getTypeSystem().getType(typeName); if (annotationType == null) throw new InvalidTokenSourceException("Type " + typeName + " not found!"); } private void getSofaCas(JCas cas, String sofaName) throws InvalidTokenSourceException { try { jCas = cas.getView(sofaName); } catch (CASException e) { throw new InvalidTokenSourceException(e); } } void validate(Type type, Collection<String> featureNames, String featurePath) throws InvalidTokenSourceException { Type typeToValidate = findTypeWithPath(type, featurePath); for (String featureName : featureNames) { Feature feature = typeToValidate.getFeatureByBaseName(featureName); if (feature == null) throw new InvalidTokenSourceException("Type " + typeToValidate.getName() + " has no feature " + featureName + ". featurePath: " + featurePath); } } private Type findTypeWithPath(Type type, String featurePath) throws InvalidTokenSourceException { if (featurePath == null) return type; String[] featurePathElements = featurePath.split("\\."); Type currentType = type; for (String featurePathElement : featurePathElements) { Feature feature = currentType.getFeatureByBaseName(featurePathElement); if (feature == null) throw new InvalidTokenSourceException( "Type " + currentType.getName() + " has no feature " + featurePathElement); currentType = feature.getRange(); if (currentType.isArray()) currentType = currentType.getComponentType(); } return currentType; } @Override public Token next(Token token) throws IOException { while (!featureValueIterator.hasNext()) { while (!featureStructureIterator.hasNext()) { if (!annotationIterator.hasNext()) return null; currentAnnotation = (Annotation) annotationIterator.next(); featureStructureIterator = createFeatureStructureIterator(currentAnnotation, featurePath); } featureValueIterator = createFeatureValueIterator(featureStructureIterator.next(), featureNames); } token.setStartOffset(currentAnnotation.getBegin()); token.setEndOffset(currentAnnotation.getEnd()); char[] value = featureValueIterator.next().toCharArray(); token.setTermBuffer(value, 0, value.length); return token; } /* * (non-Javadoc) * * @see org.apache.lucene.analysis.TokenStream#next() */ @Override public Token next() throws IOException { return next(new Token()); } protected void initializeIterators() { annotationIterator = Iterators.filter(jCas.getAnnotationIndex(annotationType).iterator(), new NotNullPredicate<Annotation>()); if (!annotationIterator.hasNext()) { featureStructureIterator = Iterators.emptyIterator(); featureValueIterator = Iterators.emptyIterator(); return; } currentAnnotation = (Annotation) annotationIterator.next(); featureStructureIterator = createFeatureStructureIterator(currentAnnotation, featurePath); if (!featureStructureIterator.hasNext()) { featureValueIterator = Iterators.emptyIterator(); return; } FeatureStructure featureStructure = featureStructureIterator.next(); featureValueIterator = createFeatureValueIterator(featureStructure, featureNames); } protected Iterator<FeatureStructure> createFeatureStructureIterator(Annotation annotation, String featurePath) { Collection<FeatureStructure> featureStructures = new LinkedList<FeatureStructure>(); Collection<FeatureStructure> childs = new LinkedList<FeatureStructure>(); if (featurePath == null) { featureStructures.add(annotation); return featureStructures.iterator(); } Type currentType = annotation.getType(); if (currentType.isArray()) currentType = currentType.getComponentType(); String[] pathEntries = featurePath.split("\\."); featureStructures.add(annotation); for (String pathEntry : pathEntries) { Feature feature = currentType.getFeatureByBaseName(pathEntry); childs.clear(); if (feature.getRange().isArray()) { for (FeatureStructure featureStructureItem : featureStructures) { FSArray fsArray = (FSArray) featureStructureItem.getFeatureValue(feature); if (fsArray == null) continue; for (int i = 0; i < fsArray.size(); i++) childs.add(fsArray.get(i)); } } else for (FeatureStructure featureStructureItem : featureStructures) childs.add(featureStructureItem.getFeatureValue(feature)); currentType = feature.getRange(); if (currentType.isArray()) currentType = currentType.getComponentType(); featureStructures.clear(); featureStructures.addAll(childs); } return Iterators.filter(featureStructures.iterator(), new NotNullPredicate<FeatureStructure>()); } protected Iterator<String> createFeatureValueIterator(FeatureStructure srcFeatureStructure, Collection<String> featureNames) { List<String> values = new LinkedList<String>(); Type featureType = srcFeatureStructure.getType(); if (featureNames.size() == 0) values.add(currentAnnotation.getCoveredText()); for (String featureName : featureNames) { Feature feature = featureType.getFeatureByBaseName(featureName); if (feature.getRange().isArray()) { StringArray fsArray = (StringArray) srcFeatureStructure.getFeatureValue(feature); if (featureNames.size() == 1) { for (int i = 0; i < fsArray.size(); i++) values.add(fsArray.get(i).toString()); } else { String value = ""; for (int i = 0; i < fsArray.size(); i++) { value = value.concat(fsArray.get(i).toString()); if (i < fsArray.size() - 1) value = value.concat(delimiter); } values.add(value); } } else values.add(getValueForFeature(srcFeatureStructure, feature, featureFormats.get(feature.getShortName()))); } String value = ""; if (delimiter != null) { for (int i = 0; i < values.size(); i++) { if (values.get(i) == null) continue; value = value.concat(values.get(i)); if (i < values.size() - 1) value = value.concat(delimiter); } values.clear(); values.add(value); } return Iterators.filter(values.iterator(), new NotNullPredicate<String>()); } public String getValueForFeature(FeatureStructure featureStructure, Feature feature, Format format) { if (format == null) return featureStructure.getFeatureValueAsString(feature); else { Object value = null; if (feature.getRange().getName().equals(CAS.TYPE_NAME_DOUBLE)) value = featureStructure.getDoubleValue(feature); else if (feature.getRange().getName().equals(CAS.TYPE_NAME_FLOAT)) value = featureStructure.getFloatValue(feature); else if (feature.getRange().getName().equals(CAS.TYPE_NAME_LONG)) value = featureStructure.getLongValue(feature); else if (feature.getRange().getName().equals(CAS.TYPE_NAME_INTEGER)) value = featureStructure.getIntValue(feature); else if (feature.getRange().getName().equals(CAS.TYPE_NAME_SHORT)) value = featureStructure.getShortValue(feature); return format.format(value); } } public void reset() { featureStructureIterator = null; currentAnnotation = null; featureFormats = Collections.emptyMap(); initializeIterators(); } public Map<String, Format> getFeatureFormats() { return featureFormats; } public JCas getJCas() { return jCas; } public String getFeaturePath() { return featurePath; } public List<String> getFeatureNames() { return featureNames; } public String getDelimiter() { return delimiter; } public Type getAnnotationType() { return annotationType; } }