Example usage for org.apache.lucene.analysis TokenStream getAttribute

List of usage examples for org.apache.lucene.analysis TokenStream getAttribute

Introduction

On this page you can find example usage for org.apache.lucene.analysis TokenStream getAttribute.

Prototype

public final <T extends Attribute> T getAttribute(Class<T> attClass) 

Document

Returns the instance of the passed in Attribute contained in this AttributeSource.

The caller must pass in a Class<? extends Attribute> value; if this AttributeSource does not contain the attribute, an IllegalArgumentException is thrown.
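
The typical consumption pattern pairs getAttribute with reset(), incrementToken(), and end(). Below is a minimal self-contained sketch, not taken from the examples on this page; it assumes a recent Lucene with a no-arg StandardAnalyzer on the classpath:

import java.io.IOException;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class GetAttributeDemo {
    public static void main(String[] args) throws IOException {
        Analyzer analyzer = new StandardAnalyzer();
        try (TokenStream ts = analyzer.tokenStream("body", "Hello Lucene world")) {
            // getAttribute returns the attribute instance shared by the whole
            // stream; it throws IllegalArgumentException if the attribute is
            // absent, so hasAttribute/addAttribute are safer when unsure.
            CharTermAttribute termAtt = ts.getAttribute(CharTermAttribute.class);
            ts.reset();
            while (ts.incrementToken()) {
                // The attribute object is reused; read its state once per token.
                System.out.println(termAtt.toString());
            }
            ts.end();
        }
    }
}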

Usage

From source file:org.alfresco.solr.AlfrescoFieldType.java

License:Open Source License

public static BytesRef analyzeMultiTerm(String field, String part, Analyzer analyzerIn) {
    if (part == null || analyzerIn == null)
        return null;

    TokenStream source = null;
    try {
        source = analyzerIn.tokenStream(field, part);
        source.reset();

        TermToBytesRefAttribute termAtt = source.getAttribute(TermToBytesRefAttribute.class);
        BytesRef bytes = termAtt.getBytesRef();

        if (!source.incrementToken())
            throw new SolrException(SolrException.ErrorCode.BAD_REQUEST,
                    "analyzer returned no terms for multiTerm term: " + part);
        if (source.incrementToken())
            throw new SolrException(SolrException.ErrorCode.BAD_REQUEST,
                    "analyzer returned too many terms for multiTerm term: " + part);

        source.end();
        return BytesRef.deepCopyOf(bytes);
    } catch (IOException e) {
        throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, "error analyzing range part: " + part, e);
    } finally {
        IOUtils.closeWhileHandlingException(source);
    }
}
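
A minimal invocation sketch for the helper above. The analyzer choice is an assumption for illustration: KeywordAnalyzer emits the whole input as a single token, so neither SolrException branch fires; the field name and term are hypothetical.

// Usage sketch, assuming org.apache.lucene.analysis.core.KeywordAnalyzer.
Analyzer analyzer = new KeywordAnalyzer();
BytesRef normalized = AlfrescoFieldType.analyzeMultiTerm("content", "abc*", analyzer);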

From source file:org.alfresco.solr.query.Solr4QueryParser.java

License:Open Source License

private ArrayList<String> getTokens(IndexableField indexableField) throws IOException {
    ArrayList<String> tokens = new ArrayList<String>();

    TokenStream ts = indexableField.tokenStream(schema.getIndexAnalyzer(), null);
    CharTermAttribute termAttribute = ts.getAttribute(CharTermAttribute.class);
    ts.reset();
    while (ts.incrementToken()) {
        String token = new String(termAttribute.buffer(), 0, termAttribute.length());
        tokens.add(token);
    }
    ts.end();
    ts.close();

    return tokens;
}

From source file:org.alfresco.solr.query.Solr4QueryParser.java

License:Open Source License

@SuppressWarnings("unchecked")
protected Query getFieldQueryImpl(String field, String queryText, AnalysisMode analysisMode,
        LuceneFunction luceneFunction) throws ParseException, IOException {
    // make sure the field exists or return a dummy query so we have no
    // error (ACE-3231)
    SchemaField schemaField = schema.getFieldOrNull(field);
    boolean isNumeric = false;
    if (schemaField == null) {
        return new TermQuery(new Term("_dummy_", "_miss_"));
    } else {
        isNumeric = (schemaField.getType().getNumericType() != null);
        if (isNumeric) {
            //Check to see if queryText is numeric or else it will fail.
            try {
                Double.valueOf(queryText);
            } catch (NumberFormatException e) {
                return new TermQuery(new Term("_dummy_", "_miss_"));
            }
        }
    }

    // Use the analyzer to get all the tokens, and then build a TermQuery,
    // PhraseQuery, or nothing

    // TODO: Untokenised columns with functions require special handling

    if (luceneFunction != LuceneFunction.FIELD) {
        throw new UnsupportedOperationException(
                "Field queries are not supported on lucene functions (UPPER, LOWER, etc)");
    }

    // if the incoming string already has a language identifier we strip it
    // off and add it back on again

    String localePrefix = "";

    String toTokenise = queryText;

    if (queryText.startsWith("{")) {
        int position = queryText.indexOf("}");
        if (position > 0) {
            String language = queryText.substring(0, position + 1);
            Locale locale = new Locale(queryText.substring(1, position));
            String token = queryText.substring(position + 1);
            boolean found = false;
            for (Locale current : Locale.getAvailableLocales()) {
                if (current.toString().equalsIgnoreCase(locale.toString())) {
                    found = true;
                    break;
                }
            }
            if (found) {
                localePrefix = language;
                toTokenise = token;
            } else {
                // toTokenise = token;
            }
        }
    }

    String testText = toTokenise;
    boolean requiresMLTokenDuplication = false;
    String localeString = null;
    if (isPropertyField(field) && (localePrefix.length() == 0)) {
        if ((queryText.length() > 0) && (queryText.charAt(0) == '\u0000')) {
            int position = queryText.indexOf("\u0000", 1);
            testText = queryText.substring(position + 1);
            requiresMLTokenDuplication = true;
            localeString = queryText.substring(1, position);

        }
    }

    // find the positions of any escaped * and ? and ignore them

    Set<Integer> wildcardPositions = getWildcardPositions(testText);

    TokenStream source = null;
    ArrayList<PackedTokenAttributeImpl> list = new ArrayList<PackedTokenAttributeImpl>();
    boolean severalTokensAtSamePosition = false;
    PackedTokenAttributeImpl nextToken;
    int positionCount = 0;

    try {
        source = getAnalyzer().tokenStream(field, new StringReader(toTokenise));
        source.reset();
        while (source.incrementToken()) {
            CharTermAttribute cta = source.getAttribute(CharTermAttribute.class);
            OffsetAttribute offsetAtt = source.getAttribute(OffsetAttribute.class);
            TypeAttribute typeAtt = null;
            if (source.hasAttribute(TypeAttribute.class)) {
                typeAtt = source.getAttribute(TypeAttribute.class);
            }
            PositionIncrementAttribute posIncAtt = null;
            if (source.hasAttribute(PositionIncrementAttribute.class)) {
                posIncAtt = source.getAttribute(PositionIncrementAttribute.class);
            }
            nextToken = new PackedTokenAttributeImpl();
            nextToken.setEmpty().copyBuffer(cta.buffer(), 0, cta.length());
            nextToken.setOffset(offsetAtt.startOffset(), offsetAtt.endOffset());
            if (typeAtt != null) {
                nextToken.setType(typeAtt.type());
            }
            if (posIncAtt != null) {
                nextToken.setPositionIncrement(posIncAtt.getPositionIncrement());
            }

            list.add(nextToken);
            if (nextToken.getPositionIncrement() != 0)
                positionCount += nextToken.getPositionIncrement();
            else
                severalTokensAtSamePosition = true;
        }
    } finally {
        try {
            if (source != null) {
                source.close();
            }
        } catch (IOException e) {
            // ignore
        }
    }

    // add any alpha numeric wildcards that have been missed
    // Fixes most stop word and wild card issues

    for (int index = 0; index < testText.length(); index++) {
        char current = testText.charAt(index);
        if (((current == '*') || (current == '?')) && wildcardPositions.contains(index)) {
            StringBuilder pre = new StringBuilder(10);
            if (index == 0) {
                // "*" and "?" at the start

                boolean found = false;
                for (int j = 0; j < list.size(); j++) {
                    PackedTokenAttributeImpl test = list.get(j);
                    if ((test.startOffset() <= 0) && (0 < test.endOffset())) {
                        found = true;
                        break;
                    }
                }
                if (!found && (list.size() == 0)) {
                    // Add new token followed by * not given by the
                    // tokeniser
                    PackedTokenAttributeImpl newToken = new PackedTokenAttributeImpl();
                    newToken.setEmpty().append("", 0, 0);
                    newToken.setType("ALPHANUM");
                    if (requiresMLTokenDuplication) {
                        Locale locale = I18NUtil.parseLocale(localeString);
                        @SuppressWarnings("resource")
                        MLTokenDuplicator duplicator = new MLTokenDuplicator(locale,
                                MLAnalysisMode.EXACT_LANGUAGE);
                        Iterator<PackedTokenAttributeImpl> it = duplicator.buildIterator(newToken);
                        if (it != null) {
                            int count = 0;
                            while (it.hasNext()) {
                                list.add(it.next());
                                count++;
                                if (count > 1) {
                                    severalTokensAtSamePosition = true;
                                }
                            }
                        }
                    }
                    // content
                    else {
                        list.add(newToken);
                    }
                }
            } else if (index > 0) {
                // Add * and ? back into any tokens from which it has been
                // removed

                boolean tokenFound = false;
                for (int j = 0; j < list.size(); j++) {
                    PackedTokenAttributeImpl test = list.get(j);
                    if ((test.startOffset() <= index) && (index < test.endOffset())) {
                        if (requiresMLTokenDuplication) {
                            String termText = test.toString();
                            int position = termText.indexOf("}");
                            String language = termText.substring(0, position + 1);
                            String token = termText.substring(position + 1);
                            if (index >= test.startOffset() + token.length()) {
                                test.setEmpty();
                                test.append(language + token + current);
                            }
                        } else {
                            if (index >= test.startOffset() + test.length()) {
                                test.setEmpty();
                                test.append(test.toString() + current);
                            }
                        }
                        tokenFound = true;
                        break;
                    }
                }

                if (!tokenFound) {
                    for (int i = index - 1; i >= 0; i--) {
                        char c = testText.charAt(i);
                        if (Character.isLetterOrDigit(c)) {
                            boolean found = false;
                            for (int j = 0; j < list.size(); j++) {
                                PackedTokenAttributeImpl test = list.get(j);
                                if ((test.startOffset() <= i) && (i < test.endOffset())) {
                                    found = true;
                                    break;
                                }
                            }
                            if (found) {
                                break;
                            } else {
                                pre.insert(0, c);
                            }
                        } else {
                            break;
                        }
                    }
                    if (pre.length() > 0) {
                        // Add new token followed by * not given by the
                        // tokeniser
                        PackedTokenAttributeImpl newToken = new PackedTokenAttributeImpl();
                        newToken.setEmpty().append(pre.toString());
                        newToken.setOffset(index - pre.length(), index);
                        newToken.setType("ALPHANUM");
                        if (requiresMLTokenDuplication) {
                            Locale locale = I18NUtil.parseLocale(localeString);
                            @SuppressWarnings("resource")
                            MLTokenDuplicator duplicator = new MLTokenDuplicator(locale,
                                    MLAnalysisMode.EXACT_LANGUAGE);
                            Iterator<PackedTokenAttributeImpl> it = duplicator.buildIterator(newToken);
                            if (it != null) {
                                int count = 0;
                                while (it.hasNext()) {
                                    list.add(it.next());
                                    count++;
                                    if (count > 1) {
                                        severalTokensAtSamePosition = true;
                                    }
                                }
                            }
                        }
                        // content
                        else {
                            list.add(newToken);
                        }
                    }
                }
            }

            StringBuilder post = new StringBuilder(10);
            if (index > 0) {
                for (int i = index + 1; i < testText.length(); i++) {
                    char c = testText.charAt(i);
                    if (Character.isLetterOrDigit(c)) {
                        boolean found = false;
                        for (int j = 0; j < list.size(); j++) {
                            PackedTokenAttributeImpl test = list.get(j);
                            if ((test.startOffset() <= i) && (i < test.endOffset())) {
                                found = true;
                                break;
                            }
                        }
                        if (found) {
                            break;
                        } else {
                            post.append(c);
                        }
                    } else {
                        break;
                    }
                }
                if (post.length() > 0) {
                    // Add new token followed by * not given by the
                    // tokeniser
                    PackedTokenAttributeImpl newToken = new PackedTokenAttributeImpl();
                    newToken.setEmpty().append(post.toString());
                    newToken.setOffset(index + 1, index + 1 + post.length());
                    newToken.setType("ALPHANUM");
                    if (requiresMLTokenDuplication) {
                        Locale locale = I18NUtil.parseLocale(localeString);
                        @SuppressWarnings("resource")
                        MLTokenDuplicator duplicator = new MLTokenDuplicator(locale,
                                MLAnalysisMode.EXACT_LANGUAGE);
                        Iterator<PackedTokenAttributeImpl> it = duplicator.buildIterator(newToken);
                        if (it != null) {
                            int count = 0;
                            while (it.hasNext()) {
                                list.add(it.next());
                                count++;
                                if (count > 1) {
                                    severalTokensAtSamePosition = true;
                                }
                            }
                        }
                    }
                    // content
                    else {
                        list.add(newToken);
                    }
                }
            }

        }
    }

    // Put in real position increments as we treat them correctly

    int currentIncrement = -1;
    for (PackedTokenAttributeImpl c : list) {
        if (currentIncrement == -1) {
            currentIncrement = c.getPositionIncrement();
        } else if (c.getPositionIncrement() > 0) {
            currentIncrement = c.getPositionIncrement();
        } else {
            c.setPositionIncrement(currentIncrement);
        }
    }

    // Fix up position increments for in phrase isolated wildcards

    boolean lastWasWild = false;
    for (int i = 0; i < list.size() - 1; i++) {
        for (int j = list.get(i).endOffset() + 1; j < list.get(i + 1).startOffset() - 1; j++) {
            if (wildcardPositions.contains(j)) {
                if (!lastWasWild) {
                    list.get(i + 1).setPositionIncrement(list.get(i + 1).getPositionIncrement() + 1);
                }
                lastWasWild = true;
            } else {
                lastWasWild = false;
            }
        }
    }

    Collections.sort(list, new Comparator<PackedTokenAttributeImpl>() {

        public int compare(PackedTokenAttributeImpl o1, PackedTokenAttributeImpl o2) {
            int dif = o1.startOffset() - o2.startOffset();
            return dif;

        }
    });

    // Combined * and ? based strings - should redo the tokeniser

    // Build tokens by position

    LinkedList<LinkedList<PackedTokenAttributeImpl>> tokensByPosition = new LinkedList<LinkedList<PackedTokenAttributeImpl>>();
    LinkedList<PackedTokenAttributeImpl> currentList = null;
    int lastStart = 0;
    for (PackedTokenAttributeImpl c : list) {
        if (c.startOffset() == lastStart) {
            if (currentList == null) {
                currentList = new LinkedList<PackedTokenAttributeImpl>();
                tokensByPosition.add(currentList);
            }
            currentList.add(c);
        } else {
            currentList = new LinkedList<PackedTokenAttributeImpl>();
            tokensByPosition.add(currentList);
            currentList.add(c);
        }
        lastStart = c.startOffset();
    }

    // Build all the token sequences and see which ones get strung together

    OrderedHashSet<LinkedList<PackedTokenAttributeImpl>> allTokenSequencesSet = new OrderedHashSet<LinkedList<PackedTokenAttributeImpl>>();
    for (LinkedList<PackedTokenAttributeImpl> tokensAtPosition : tokensByPosition) {
        OrderedHashSet<LinkedList<PackedTokenAttributeImpl>> positionalSynonymSequencesSet = new OrderedHashSet<LinkedList<PackedTokenAttributeImpl>>();

        OrderedHashSet<LinkedList<PackedTokenAttributeImpl>> newAllTokenSequencesSet = new OrderedHashSet<LinkedList<PackedTokenAttributeImpl>>();

        FOR_FIRST_TOKEN_AT_POSITION_ONLY: for (PackedTokenAttributeImpl t : tokensAtPosition) {
            PackedTokenAttributeImpl replace = new PackedTokenAttributeImpl();
            replace.setEmpty().append(t);
            replace.setOffset(t.startOffset(), t.endOffset());
            replace.setType(t.type());
            replace.setPositionIncrement(t.getPositionIncrement());

            boolean tokenFoundSequence = false;
            for (LinkedList<PackedTokenAttributeImpl> tokenSequence : allTokenSequencesSet) {
                LinkedList<PackedTokenAttributeImpl> newEntry = new LinkedList<PackedTokenAttributeImpl>();
                newEntry.addAll(tokenSequence);
                if ((newEntry.getLast().endOffset() == replace.endOffset())
                        && replace.type().equals(SynonymFilter.TYPE_SYNONYM)) {
                    if ((newEntry.getLast().startOffset() == replace.startOffset())
                            && newEntry.getLast().type().equals(SynonymFilter.TYPE_SYNONYM)) {
                        positionalSynonymSequencesSet.add(tokenSequence);
                        newEntry.add(replace);
                        tokenFoundSequence = true;
                    } else if (newEntry.getLast().type().equals(CommonGramsFilter.GRAM_TYPE)) {
                        if (newEntry.toString().endsWith(replace.toString())) {
                            // already in the gram
                            positionalSynonymSequencesSet.add(tokenSequence);
                            tokenFoundSequence = true;
                        } else {
                            // need to replace the synonym in the current
                            // gram
                            tokenFoundSequence = true;
                            StringBuffer old = new StringBuffer(newEntry.getLast().toString());
                            old.replace(replace.startOffset() - newEntry.getLast().startOffset(),
                                    replace.endOffset() - newEntry.getLast().startOffset(), replace.toString());
                            PackedTokenAttributeImpl newToken = new PackedTokenAttributeImpl();
                            newToken.setEmpty().append(old.toString());
                            newToken.setOffset(newEntry.getLast().startOffset(),
                                    newEntry.getLast().endOffset());
                            newEntry.removeLast();
                            newEntry.add(newToken);
                        }
                    }
                } else if ((newEntry.getLast().startOffset() < replace.startOffset())
                        && (newEntry.getLast().endOffset() < replace.endOffset())) {
                    if (newEntry.getLast().type().equals(SynonymFilter.TYPE_SYNONYM)
                            && replace.type().equals(SynonymFilter.TYPE_SYNONYM)) {
                        positionalSynonymSequencesSet.add(tokenSequence);
                    }
                    newEntry.add(replace);
                    tokenFoundSequence = true;
                }
                newAllTokenSequencesSet.add(newEntry);
            }
            if (!tokenFoundSequence) {
                for (LinkedList<PackedTokenAttributeImpl> tokenSequence : newAllTokenSequencesSet) {
                    LinkedList<PackedTokenAttributeImpl> newEntry = new LinkedList<PackedTokenAttributeImpl>();
                    newEntry.addAll(tokenSequence);
                    if ((newEntry.getLast().endOffset() == replace.endOffset())
                            && replace.type().equals(SynonymFilter.TYPE_SYNONYM)) {
                        if ((newEntry.getLast().startOffset() == replace.startOffset())
                                && newEntry.getLast().type().equals(SynonymFilter.TYPE_SYNONYM)) {
                            positionalSynonymSequencesSet.add(tokenSequence);
                            newEntry.add(replace);
                            tokenFoundSequence = true;
                        } else if (newEntry.getLast().type().equals(CommonGramsFilter.GRAM_TYPE)) {
                            if (newEntry.toString().endsWith(replace.toString())) {
                                // already in the gram
                                positionalSynonymSequencesSet.add(tokenSequence);
                                tokenFoundSequence = true;
                            } else {
                                // need to replace the synonym in the
                                // current gram
                                tokenFoundSequence = true;
                                StringBuffer old = new StringBuffer(newEntry.getLast().toString());
                                old.replace(replace.startOffset() - newEntry.getLast().startOffset(),
                                        replace.endOffset() - newEntry.getLast().startOffset(),
                                        replace.toString());
                                PackedTokenAttributeImpl newToken = new PackedTokenAttributeImpl();
                                newToken.setEmpty().append(old.toString());
                                newToken.setOffset(newEntry.getLast().startOffset(),
                                        newEntry.getLast().endOffset());
                                newEntry.removeLast();
                                newEntry.add(newToken);
                                positionalSynonymSequencesSet.add(newEntry);
                            }
                        }
                    } else if ((newEntry.getLast().startOffset() < replace.startOffset())
                            && (newEntry.getLast().endOffset() < replace.endOffset())) {
                        if (newEntry.getLast().type().equals(SynonymFilter.TYPE_SYNONYM)
                                && replace.type().equals(SynonymFilter.TYPE_SYNONYM)) {
                            positionalSynonymSequencesSet.add(tokenSequence);
                            newEntry.add(replace);
                            tokenFoundSequence = true;
                        }
                    }
                }
            }
            if (!tokenFoundSequence) {
                LinkedList<PackedTokenAttributeImpl> newEntry = new LinkedList<PackedTokenAttributeImpl>();
                newEntry.add(replace);
                newAllTokenSequencesSet.add(newEntry);
            }
            // Limit the max number of permutations we consider
            if (newAllTokenSequencesSet.size() > 64) {
                break FOR_FIRST_TOKEN_AT_POSITION_ONLY;
            }
        }
        allTokenSequencesSet = newAllTokenSequencesSet;
        allTokenSequencesSet.addAll(positionalSynonymSequencesSet);

    }

    LinkedList<LinkedList<PackedTokenAttributeImpl>> allTokenSequences = new LinkedList<LinkedList<PackedTokenAttributeImpl>>(
            allTokenSequencesSet);

    // build the unique

    LinkedList<LinkedList<PackedTokenAttributeImpl>> fixedTokenSequences = new LinkedList<LinkedList<PackedTokenAttributeImpl>>();
    for (LinkedList<PackedTokenAttributeImpl> tokenSequence : allTokenSequences) {
        LinkedList<PackedTokenAttributeImpl> fixedTokenSequence = new LinkedList<PackedTokenAttributeImpl>();
        fixedTokenSequences.add(fixedTokenSequence);
        PackedTokenAttributeImpl replace = null;
        for (PackedTokenAttributeImpl c : tokenSequence) {
            if (replace == null) {
                StringBuilder prefix = new StringBuilder();
                for (int i = c.startOffset() - 1; i >= 0; i--) {
                    char test = testText.charAt(i);
                    if (((test == '*') || (test == '?')) && wildcardPositions.contains(i)) {
                        prefix.insert(0, test);
                    } else {
                        break;
                    }
                }
                String pre = prefix.toString();
                if (requiresMLTokenDuplication) {
                    String termText = c.toString();
                    int position = termText.indexOf("}");
                    String language = termText.substring(0, position + 1);
                    String token = termText.substring(position + 1);
                    replace = new PackedTokenAttributeImpl();
                    replace.setEmpty().append(language + pre + token);
                    replace.setOffset(c.startOffset() - pre.length(), c.endOffset());
                    replace.setType(c.type());
                    replace.setPositionIncrement(c.getPositionIncrement());
                } else {
                    String termText = c.toString();
                    replace = new PackedTokenAttributeImpl();
                    replace.setEmpty().append(pre + termText);
                    replace.setOffset(c.startOffset() - pre.length(), c.endOffset());
                    replace.setType(c.type());
                    replace.setPositionIncrement(c.getPositionIncrement());
                }
            } else {
                StringBuilder prefix = new StringBuilder();
                StringBuilder postfix = new StringBuilder();
                StringBuilder builder = prefix;
                for (int i = c.startOffset() - 1; i >= replace.endOffset(); i--) {
                    char test = testText.charAt(i);
                    if (((test == '*') || (test == '?')) && wildcardPositions.contains(i)) {
                        builder.insert(0, test);
                    } else {
                        builder = postfix;
                        postfix.setLength(0);
                    }
                }
                String pre = prefix.toString();
                String post = postfix.toString();

                // Does it bridge?
                if ((pre.length() > 0) && (replace.endOffset() + pre.length()) == c.startOffset()) {
                    String termText = c.toString();
                    if (requiresMLTokenDuplication) {
                        int position = termText.indexOf("}");
                        @SuppressWarnings("unused")
                        String language = termText.substring(0, position + 1);
                        String token = termText.substring(position + 1);
                        int oldPositionIncrement = replace.getPositionIncrement();
                        String replaceTermText = replace.toString();
                        replace = new PackedTokenAttributeImpl();
                        replace.setEmpty().append(replaceTermText + pre + token);
                        replace.setOffset(replace.startOffset(), c.endOffset());
                        replace.setType(replace.type());
                        replace.setPositionIncrement(oldPositionIncrement);
                    } else {
                        int oldPositionIncrement = replace.getPositionIncrement();
                        String replaceTermText = replace.toString();
                        replace = new PackedTokenAttributeImpl();
                        replace.setEmpty().append(replaceTermText + pre + termText);
                        replace.setOffset(replace.startOffset(), c.endOffset());
                        replace.setType(replace.type());
                        replace.setPositionIncrement(oldPositionIncrement);
                    }
                } else {
                    String termText = c.toString();
                    if (requiresMLTokenDuplication) {
                        int position = termText.indexOf("}");
                        String language = termText.substring(0, position + 1);
                        String token = termText.substring(position + 1);
                        String replaceTermText = replace.toString();
                        PackedTokenAttributeImpl last = new PackedTokenAttributeImpl();
                        last.setEmpty().append(replaceTermText + post);
                        last.setOffset(replace.startOffset(), replace.endOffset() + post.length());
                        last.setType(replace.type());
                        last.setPositionIncrement(replace.getPositionIncrement());
                        fixedTokenSequence.add(last);
                        replace = new PackedTokenAttributeImpl();
                        replace.setEmpty().append(language + pre + token);
                        replace.setOffset(c.startOffset() - pre.length(), c.endOffset());
                        replace.setType(c.type());
                        replace.setPositionIncrement(c.getPositionIncrement());
                    } else {
                        String replaceTermText = replace.toString();
                        PackedTokenAttributeImpl last = new PackedTokenAttributeImpl();
                        last.setEmpty().append(replaceTermText + post);
                        last.setOffset(replace.startOffset(), replace.endOffset() + post.length());
                        last.setType(replace.type());
                        last.setPositionIncrement(replace.getPositionIncrement());
                        fixedTokenSequence.add(last);
                        replace = new PackedTokenAttributeImpl();
                        replace.setEmpty().append(pre + termText);
                        replace.setOffset(c.startOffset() - pre.length(), c.endOffset());
                        replace.setType(c.type());
                        replace.setPositionIncrement(c.getPositionIncrement());
                    }
                }
            }
        }
        // finish last
        if (replace != null) {
            StringBuilder postfix = new StringBuilder();
            if ((replace.endOffset() >= 0) && (replace.endOffset() < testText.length())) {
                for (int i = replace.endOffset(); i < testText.length(); i++) {
                    char test = testText.charAt(i);
                    if (((test == '*') || (test == '?')) && wildcardPositions.contains(i)) {
                        postfix.append(test);
                    } else {
                        break;
                    }
                }
            }
            String post = postfix.toString();
            int oldPositionIncrement = replace.getPositionIncrement();
            String replaceTermText = replace.toString();
            PackedTokenAttributeImpl terminal = new PackedTokenAttributeImpl();
            terminal.setEmpty().append(replaceTermText + post);
            terminal.setOffset(replace.startOffset(), replace.endOffset() + post.length());
            terminal.setType(replace.type());
            terminal.setPositionIncrement(oldPositionIncrement);
            fixedTokenSequence.add(terminal);
        }
    }

    // rebuild fixed list

    ArrayList<PackedTokenAttributeImpl> fixed = new ArrayList<PackedTokenAttributeImpl>();
    for (LinkedList<PackedTokenAttributeImpl> tokenSequence : fixedTokenSequences) {
        for (PackedTokenAttributeImpl token : tokenSequence) {
            fixed.add(token);
        }
    }

    // reorder by start position and increment

    Collections.sort(fixed, new Comparator<PackedTokenAttributeImpl>() {

        public int compare(PackedTokenAttributeImpl o1, PackedTokenAttributeImpl o2) {
            int dif = o1.startOffset() - o2.startOffset();
            if (dif != 0) {
                return dif;
            } else {
                return o1.getPositionIncrement() - o2.getPositionIncrement();
            }
        }
    });

    // make sure we remove any tokens we have duplicated

    @SuppressWarnings("rawtypes")
    OrderedHashSet unique = new OrderedHashSet();
    unique.addAll(fixed);
    fixed = new ArrayList<PackedTokenAttributeImpl>(unique);

    list = fixed;

    // add any missing locales back to the tokens

    if (localePrefix.length() > 0) {
        for (int j = 0; j < list.size(); j++) {
            PackedTokenAttributeImpl currentToken = list.get(j);
            String termText = currentToken.toString();
            currentToken.setEmpty();
            currentToken.append(localePrefix + termText);
        }
    }

    SchemaField sf = schema.getField(field);

    boolean isShingled = false;
    @SuppressWarnings("resource")
    TokenizerChain tokenizerChain = (sf.getType().getQueryAnalyzer() instanceof TokenizerChain)
            ? ((TokenizerChain) sf.getType().getQueryAnalyzer())
            : null;
    if (tokenizerChain != null) {
        for (TokenFilterFactory factory : tokenizerChain.getTokenFilterFactories()) {
            if (factory instanceof ShingleFilterFactory) {
                isShingled = true;
                break;
            }
        }
    }
    @SuppressWarnings("resource")
    AlfrescoAnalyzerWrapper analyzerWrapper = (sf.getType()
            .getQueryAnalyzer() instanceof AlfrescoAnalyzerWrapper)
                    ? ((AlfrescoAnalyzerWrapper) sf.getType().getQueryAnalyzer())
                    : null;
    if (analyzerWrapper != null) {
        // assume if there are no term positions it is shingled ....
        isShingled = true;
    }

    boolean forceConjunction = rerankPhase == RerankPhase.QUERY_PHASE;

    if (list.size() == 0) {
        return null;
    } else if (list.size() == 1) {
        nextToken = list.get(0);
        String termText = nextToken.toString();
        if (!isNumeric && (termText.contains("*") || termText.contains("?"))) {
            return newWildcardQuery(new Term(field, termText));
        } else {
            return newTermQuery(new Term(field, termText));
        }
    } else {
        if (severalTokensAtSamePosition) {
            if (positionCount == 1) {
                // no phrase query:
                Builder q = newBooleanQuery();
                for (int i = 0; i < list.size(); i++) {
                    Query currentQuery;
                    nextToken = list.get(i);
                    String termText = nextToken.toString();
                    if (termText.contains("*") || termText.contains("?")) {
                        currentQuery = newWildcardQuery(new Term(field, termText));
                    } else {
                        currentQuery = newTermQuery(new Term(field, termText));
                    }
                    q.add(currentQuery, BooleanClause.Occur.SHOULD);
                }
                return q.build();
            } else if (forceConjunction) {
                BooleanQuery.Builder or = new BooleanQuery.Builder();

                for (LinkedList<PackedTokenAttributeImpl> tokenSequence : fixedTokenSequences) {
                    BooleanQuery.Builder and = new BooleanQuery.Builder();
                    for (int i = 0; i < tokenSequence.size(); i++) {
                        nextToken = (PackedTokenAttributeImpl) tokenSequence.get(i);
                        String termText = nextToken.toString();

                        Term term = new Term(field, termText);
                        if ((termText != null) && (termText.contains("*") || termText.contains("?"))) {
                            org.apache.lucene.search.WildcardQuery wildQuery = new org.apache.lucene.search.WildcardQuery(
                                    term);
                            and.add(wildQuery, Occur.MUST);
                        } else {
                            TermQuery termQuery = new TermQuery(term);
                            and.add(termQuery, Occur.MUST);
                        }
                    }
                    if (and.build().clauses().size() > 0) {
                        or.add(and.build(), Occur.SHOULD);
                    }
                }
                return or.build();
            }
            // shingle
            else if (sf.omitPositions() && isShingled) {

                ArrayList<PackedTokenAttributeImpl> nonContained = getNonContained(list);
                Query currentQuery;

                BooleanQuery.Builder weakPhrase = new BooleanQuery.Builder();
                for (PackedTokenAttributeImpl shingleToken : nonContained) {
                    String termText = shingleToken.toString();
                    Term term = new Term(field, termText);

                    if ((termText != null) && (termText.contains("*") || termText.contains("?"))) {
                        currentQuery = new org.apache.lucene.search.WildcardQuery(term);
                    } else {
                        currentQuery = new TermQuery(term);
                    }
                    weakPhrase.add(currentQuery, Occur.MUST);
                }

                return weakPhrase.build();

            }
            // Word delimiter factory and other odd things generate complex
            // token patterns.
            // Smartly skip token sequences with small tokens that generate
            // too many wildcards, and fall back to the larger pattern.
            // e.g. Site1* will not do (S ite 1*) or (Site 1*) if 1* matches
            // too much; (S ite1*) and (Site1*) will still be OK.
            // If we skip all (for just 1* in the input) this is still an
            // issue.
            else {

                return generateSpanOrQuery(field, fixedTokenSequences);

            }
        } else {
            if (forceConjunction) {
                BooleanQuery.Builder or = new BooleanQuery.Builder();

                for (LinkedList<PackedTokenAttributeImpl> tokenSequence : fixedTokenSequences) {
                    BooleanQuery.Builder and = new BooleanQuery.Builder();
                    for (int i = 0; i < tokenSequence.size(); i++) {
                        nextToken = (PackedTokenAttributeImpl) tokenSequence.get(i);
                        String termText = nextToken.toString();

                        Term term = new Term(field, termText);
                        if ((termText != null) && (termText.contains("*") || termText.contains("?"))) {
                            org.apache.lucene.search.WildcardQuery wildQuery = new org.apache.lucene.search.WildcardQuery(
                                    term);
                            and.add(wildQuery, Occur.MUST);
                        } else {
                            TermQuery termQuery = new TermQuery(term);
                            and.add(termQuery, Occur.MUST);
                        }
                    }
                    if (and.build().clauses().size() > 0) {
                        or.add(and.build(), Occur.SHOULD);
                    }
                }
                return or.build();
            } else {
                SpanQuery spanQuery = null;
                ArrayList<SpanQuery> atSamePositionSpanOrQueryParts = new ArrayList<SpanQuery>();
                int gap = 0;
                for (int i = 0; i < list.size(); i++) {
                    nextToken = list.get(i);
                    String termText = nextToken.toString();
                    Term term = new Term(field, termText);
                    if (getEnablePositionIncrements()) {
                        SpanQuery nextSpanQuery;
                        if ((termText != null) && (termText.contains("*") || termText.contains("?"))) {
                            org.apache.lucene.search.WildcardQuery wildQuery = new org.apache.lucene.search.WildcardQuery(
                                    term);
                            SpanMultiTermQueryWrapper<org.apache.lucene.search.WildcardQuery> wrapper = new SpanMultiTermQueryWrapper<org.apache.lucene.search.WildcardQuery>(
                                    wildQuery);
                            wrapper.setRewriteMethod(
                                    new TopTermsSpanBooleanQueryRewrite(topTermSpanRewriteLimit));
                            nextSpanQuery = wrapper;
                        } else {
                            nextSpanQuery = new SpanTermQuery(term);
                        }
                        if (gap == 0) {
                            atSamePositionSpanOrQueryParts.add(nextSpanQuery);
                        } else {
                            if (atSamePositionSpanOrQueryParts.size() == 0) {
                                if (spanQuery == null) {
                                    spanQuery = nextSpanQuery;
                                } else {
                                    spanQuery = new SpanNearQuery(new SpanQuery[] { spanQuery, nextSpanQuery },
                                            (gap - 1) + internalSlop, internalSlop < 2);
                                }
                                atSamePositionSpanOrQueryParts = new ArrayList<SpanQuery>();
                            } else if (atSamePositionSpanOrQueryParts.size() == 1) {
                                if (spanQuery == null) {
                                    spanQuery = atSamePositionSpanOrQueryParts.get(0);
                                } else {
                                    spanQuery = new SpanNearQuery(
                                            new SpanQuery[] { spanQuery,
                                                    atSamePositionSpanOrQueryParts.get(0) },
                                            (gap - 1) + internalSlop, internalSlop < 2);
                                }
                                atSamePositionSpanOrQueryParts = new ArrayList<SpanQuery>();
                                atSamePositionSpanOrQueryParts.add(nextSpanQuery);
                            } else {
                                if (spanQuery == null) {
                                    spanQuery = new SpanOrQuery(
                                            atSamePositionSpanOrQueryParts.toArray(new SpanQuery[] {}));
                                } else {
                                    spanQuery = new SpanNearQuery(
                                            new SpanQuery[] { spanQuery,
                                                    new SpanOrQuery(atSamePositionSpanOrQueryParts
                                                            .toArray(new SpanQuery[] {})) },
                                            (gap - 1) + internalSlop, internalSlop < 2);
                                }
                                atSamePositionSpanOrQueryParts = new ArrayList<SpanQuery>();
                                atSamePositionSpanOrQueryParts.add(nextSpanQuery);
                            }
                        }
                        gap = nextToken.getPositionIncrement();
                    } else {
                        SpanQuery nextSpanQuery;
                        if ((termText != null) && (termText.contains("*") || termText.contains("?"))) {
                            org.apache.lucene.search.WildcardQuery wildQuery = new org.apache.lucene.search.WildcardQuery(
                                    term);
                            SpanMultiTermQueryWrapper<org.apache.lucene.search.WildcardQuery> wrapper = new SpanMultiTermQueryWrapper<org.apache.lucene.search.WildcardQuery>(
                                    wildQuery);
                            wrapper.setRewriteMethod(
                                    new TopTermsSpanBooleanQueryRewrite(topTermSpanRewriteLimit));
                            nextSpanQuery = wrapper;
                        } else {
                            nextSpanQuery = new SpanTermQuery(term);
                        }
                        if (spanQuery == null) {
                            spanQuery = new SpanOrQuery(nextSpanQuery);
                        } else {
                            spanQuery = new SpanOrQuery(spanQuery, nextSpanQuery);
                        }
                    }
                }
                if (atSamePositionSpanOrQueryParts.size() == 0) {
                    return spanQuery;
                } else if (atSamePositionSpanOrQueryParts.size() == 1) {
                    if (spanQuery == null) {
                        spanQuery = atSamePositionSpanOrQueryParts.get(0);
                    } else {
                        spanQuery = new SpanNearQuery(
                                new SpanQuery[] { spanQuery, atSamePositionSpanOrQueryParts.get(0) },
                                (gap - 1) + internalSlop, internalSlop < 2);
                    }
                    return spanQuery;
                } else {
                    if (spanQuery == null) {
                        spanQuery = new SpanOrQuery(atSamePositionSpanOrQueryParts.toArray(new SpanQuery[] {}));
                    } else {
                        spanQuery = new SpanNearQuery(
                                new SpanQuery[] { spanQuery,
                                        new SpanOrQuery(
                                                atSamePositionSpanOrQueryParts.toArray(new SpanQuery[] {})) },
                                (gap - 1) + internalSlop, internalSlop < 2);
                    }
                    return spanQuery;
                }
            }
        }
    }
}

From source file:org.alfresco.solr.query.Solr4QueryParser.java

License:Open Source License

private String getFirstTokenForRange(String string, FieldInstance field) throws IOException {
    PackedTokenAttributeImpl nextToken;
    TokenStream source = null;

    try {
        source = getAnalyzer().tokenStream(field.getField(), new StringReader(string));
        source.reset();
        while (source.incrementToken()) {
            CharTermAttribute cta = source.getAttribute(CharTermAttribute.class);
            OffsetAttribute offsetAtt = source.getAttribute(OffsetAttribute.class);
            TypeAttribute typeAtt = null;
            if (source.hasAttribute(TypeAttribute.class)) {
                typeAtt = source.getAttribute(TypeAttribute.class);
            }
            PositionIncrementAttribute posIncAtt = null;
            if (source.hasAttribute(PositionIncrementAttribute.class)) {
                posIncAtt = source.getAttribute(PositionIncrementAttribute.class);
            }
            nextToken = new PackedTokenAttributeImpl();
            nextToken.setEmpty().copyBuffer(cta.buffer(), 0, cta.length());
            nextToken.setOffset(offsetAtt.startOffset(), offsetAtt.endOffset());
            if (typeAtt != null) {
                nextToken.setType(typeAtt.type());
            }
            if (posIncAtt != null) {
                nextToken.setPositionIncrement(posIncAtt.getPositionIncrement());
            }

            return nextToken.toString();
        }
    } finally {
        try {
            if (source != null) {
                source.close();
            }
        } catch (IOException e) {
            // ignore
        }
    }
    return null;
}

From source file:org.alfresco.solr.query.Solr4QueryParser.java

License:Open Source License

/**
 * @param first
 * @param field
 * @return SpanOrQuery
 * @throws IOException
 */
private SpanQuery buildSpanOrQuery(String first, FieldInstance field) throws IOException {
    ArrayList<SpanQuery> spanOrQueryParts = new ArrayList<SpanQuery>();

    PackedTokenAttributeImpl nextToken;
    TokenStream source = null;

    try {
        source = getAnalyzer().tokenStream(field.getField(), new StringReader(first));
        source.reset();
        while (source.incrementToken()) {
            CharTermAttribute cta = source.getAttribute(CharTermAttribute.class);
            OffsetAttribute offsetAtt = source.getAttribute(OffsetAttribute.class);
            TypeAttribute typeAtt = null;
            if (source.hasAttribute(TypeAttribute.class)) {
                typeAtt = source.getAttribute(TypeAttribute.class);
            }
            PositionIncrementAttribute posIncAtt = null;
            if (source.hasAttribute(PositionIncrementAttribute.class)) {
                posIncAtt = source.getAttribute(PositionIncrementAttribute.class);
            }
            nextToken = new PackedTokenAttributeImpl();
            nextToken.setEmpty().copyBuffer(cta.buffer(), 0, cta.length());
            nextToken.setOffset(offsetAtt.startOffset(), offsetAtt.endOffset());
            if (typeAtt != null) {
                nextToken.setType(typeAtt.type());
            }
            if (posIncAtt != null) {
                nextToken.setPositionIncrement(posIncAtt.getPositionIncrement());
            }

            SpanQuery termQuery = new SpanTermQuery(new Term(field.getField(), nextToken.toString()));
            spanOrQueryParts.add(termQuery);
        }
    } finally {
        try {
            if (source != null) {
                source.close();
            }
        } catch (IOException e) {
            // ignore
        }
    }

    if (spanOrQueryParts.size() == 1) {
        return spanOrQueryParts.get(0);
    } else {
        return new SpanOrQuery(spanOrQueryParts.toArray(new SpanQuery[] {}));
    }
}

From source file:org.alfresco.solr.SolrInformationServer.java

License:Open Source License

private void addContentPropertyToDocUsingAlfrescoRepository(SolrInputDocument doc, QName propertyQName,
        long dbId, String locale) throws AuthenticationException, IOException {
    long start = System.nanoTime();

    // Expensive call to be done with ContentTracker
    GetTextContentResponse response = repositoryClient.getTextContent(dbId, propertyQName, null);

    addContentPropertyMetadata(doc, propertyQName, AlfrescoSolrDataModel.ContentFieldType.TRANSFORMATION_STATUS,
            response);
    addContentPropertyMetadata(doc, propertyQName,
            AlfrescoSolrDataModel.ContentFieldType.TRANSFORMATION_EXCEPTION, response);
    addContentPropertyMetadata(doc, propertyQName, AlfrescoSolrDataModel.ContentFieldType.TRANSFORMATION_TIME,
            response);

    InputStream ris = response.getContent();
    String textContent = "";
    try {
        if (ris != null) {
            // Get and copy content
            byte[] bytes = FileCopyUtils.copyToByteArray(new BoundedInputStream(ris, contentStreamLimit));
            textContent = new String(bytes, StandardCharsets.UTF_8);
        }
    } finally {
        // release the response only when the content has been read
        response.release();
    }

    if (minHash && textContent.length() > 0) {
        Analyzer analyzer = core.getLatestSchema().getFieldType("min_hash").getIndexAnalyzer();
        TokenStream ts = analyzer.tokenStream("min_hash", textContent);
        CharTermAttribute termAttribute = ts.getAttribute(CharTermAttribute.class);
        ts.reset();
        while (ts.incrementToken()) {
            StringBuilder tokenBuff = new StringBuilder();
            char[] buff = termAttribute.buffer();

            for (int i = 0; i < termAttribute.length(); i++) {
                tokenBuff.append(Integer.toHexString(buff[i]));
            }
            doc.addField(FINGERPRINT_FIELD, tokenBuff.toString());

        }
        ts.end();
        ts.close();
    }

    long end = System.nanoTime();
    this.getTrackerStats().addDocTransformationTime(end - start);

    StringBuilder builder = new StringBuilder(textContent.length() + 16);
    builder.append("\u0000").append(locale).append("\u0000");
    builder.append(textContent);
    String localisedText = builder.toString();

    for (FieldInstance field : AlfrescoSolrDataModel.getInstance()
            .getIndexedFieldNamesForProperty(propertyQName).getFields()) {
        doc.removeField(field.getField());
        if (field.isLocalised()) {
            doc.addField(field.getField(), localisedText);
        } else {
            doc.addField(field.getField(), textContent);
        }
        addFieldIfNotSet(doc, field);
    }
}

From source file:org.allenai.blacklab.queryParser.lucene.QueryParserBase.java

License:Apache License

protected BytesRef analyzeMultitermTerm(String field, String part, Analyzer analyzerIn) {
    TokenStream source;

    if (analyzerIn == null)
        analyzerIn = analyzer;

    try {
        source = analyzerIn.tokenStream(field, new StringReader(part));
        source.reset();
    } catch (IOException e) {
        throw new RuntimeException("Unable to initialize TokenStream to analyze multiTerm term: " + part, e);
    }

    TermToBytesRefAttribute termAtt = source.getAttribute(TermToBytesRefAttribute.class);
    BytesRef bytes = termAtt.getBytesRef();

    try {
        if (!source.incrementToken())
            throw new IllegalArgumentException("analyzer returned no terms for multiTerm term: " + part);
        termAtt.fillBytesRef();
        if (source.incrementToken())
            throw new IllegalArgumentException("analyzer returned too many terms for multiTerm term: " + part);
    } catch (IOException e) {
        throw new RuntimeException("error analyzing range part: " + part, e);
    }

    try {
        source.end();
        source.close();
    } catch (IOException e) {
        throw new RuntimeException("Unable to end & close TokenStream after analyzing multiTerm term: " + part,
                e);
    }

    return BytesRef.deepCopyOf(bytes);
}

From source file:org.apache.jackrabbit.core.query.lucene.AbstractExcerpt.java

License:Apache License

/**
 * @param text the text.
 * @return a <code>TermPositionVector</code> for the given text.
 */
private TermPositionVector createTermPositionVector(String text) {
    // term -> TermVectorOffsetInfo[]
    final SortedMap<String, TermVectorOffsetInfo[]> termMap = new TreeMap<String, TermVectorOffsetInfo[]>();
    Reader r = new StringReader(text);
    TokenStream ts = index.getTextAnalyzer().tokenStream("", r);
    try {
        while (ts.incrementToken()) {
            OffsetAttribute offset = ts.getAttribute(OffsetAttribute.class);
            TermAttribute term = ts.getAttribute(TermAttribute.class);
            String termText = term.term();
            TermVectorOffsetInfo[] info = termMap.get(termText);
            if (info == null) {
                info = new TermVectorOffsetInfo[1];
            } else {
                TermVectorOffsetInfo[] tmp = info;
                info = new TermVectorOffsetInfo[tmp.length + 1];
                System.arraycopy(tmp, 0, info, 0, tmp.length);
            }
            info[info.length - 1] = new TermVectorOffsetInfo(offset.startOffset(), offset.endOffset());
            termMap.put(termText, info);
        }
        ts.end();
        ts.close();
    } catch (IOException e) {
        // should never happen, we are reading from a string
    }

    return new TermPositionVector() {

        private String[] terms = (String[]) termMap.keySet().toArray(new String[termMap.size()]);

        public int[] getTermPositions(int index) {
            return null;
        }

        public TermVectorOffsetInfo[] getOffsets(int index) {
            TermVectorOffsetInfo[] info = TermVectorOffsetInfo.EMPTY_OFFSET_INFO;
            if (index >= 0 && index < terms.length) {
                info = termMap.get(terms[index]);
            }
            return info;
        }

        public String getField() {
            return "";
        }

        public int size() {
            return terms.length;
        }

        public String[] getTerms() {
            return terms;
        }

        public int[] getTermFrequencies() {
            int[] freqs = new int[terms.length];
            for (int i = 0; i < terms.length; i++) {
                freqs[i] = termMap.get(terms[i]).length;
            }
            return freqs;
        }

        public int indexOf(String term) {
            int res = Arrays.binarySearch(terms, term);
            return res >= 0 ? res : -1;
        }

        public int[] indexesOf(String[] terms, int start, int len) {
            int[] res = new int[len];
            for (int i = 0; i < len; i++) {
                res[i] = indexOf(terms[i]);
            }
            return res;
        }
    };
}

From source file:org.apache.jackrabbit.core.query.lucene.MoreLikeThis.java

License:Apache License

/**
 * Adds term frequencies found by tokenizing text from reader into the Map words
 * @param r a source of text to be tokenized
 * @param termFreqMap a Map of terms and their frequencies
 * @param fieldName Used by analyzer for any special per-field analysis
 */
private void addTermFrequencies(Reader r, Map<String, Int> termFreqMap, String fieldName) throws IOException {
    TokenStream ts = analyzer.tokenStream(fieldName, r);
    int tokenCount = 0;
    // for every token
    while (ts.incrementToken()) {
        TermAttribute term = ts.getAttribute(TermAttribute.class);
        String word = term.term();
        tokenCount++;
        if (tokenCount > maxNumTokensParsed) {
            break;
        }
        if (isNoiseWord(word)) {
            continue;
        }

        // increment frequency
        Int cnt = termFreqMap.get(word);
        if (cnt == null) {
            termFreqMap.put(word, new Int());
        } else {
            cnt.x++;
        }
    }
    ts.end();
    ts.close();
}

From source file:org.apache.mahout.classifier.NewsgroupHelper.java

License:Apache License

public static void countWords(Analyzer analyzer, Collection<String> words, Reader in,
        Multiset<String> overallCounts) throws IOException {
    TokenStream ts = analyzer.tokenStream("text", in);
    ts.addAttribute(CharTermAttribute.class);
    ts.reset();
    while (ts.incrementToken()) {
        String s = ts.getAttribute(CharTermAttribute.class).toString();
        words.add(s);
    }
    overallCounts.addAll(words);
    ts.end();
    Closeables.close(ts, true);
}
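
A hedged usage sketch for countWords. The analyzer and input string are assumptions for illustration, and Guava (Multiset/HashMultiset) plus the Lucene analyzers module are assumed on the classpath, as the Mahout helper itself already requires:

// Usage sketch; WhitespaceAnalyzer's no-arg constructor assumes a recent Lucene.
Analyzer analyzer = new WhitespaceAnalyzer();
Collection<String> words = new ArrayList<>();
Multiset<String> overallCounts = HashMultiset.create();
NewsgroupHelper.countWords(analyzer, words, new StringReader("alpha beta alpha"), overallCounts);
// words now holds every emitted token; overallCounts accumulates across calls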