Example usage for org.apache.lucene.analysis TokenStream reset

Introduction

On this page you can find example usage for org.apache.lucene.analysis.TokenStream#reset().

Prototype

public void reset() throws IOException 

Document

This method is called by a consumer before it begins consumption using #incrementToken().
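
As a quick orientation, here is a minimal, self-contained sketch of the consumption contract that reset() belongs to. The analyzer, field name, and sample text are illustrative choices, not taken from any example below:

import java.io.IOException;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class TokenStreamResetSketch {
    public static void main(String[] args) throws IOException {
        Analyzer analyzer = new StandardAnalyzer();
        // try-with-resources guarantees close() even if consumption fails
        try (TokenStream ts = analyzer.tokenStream("body", "Hello token streams")) {
            CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
            ts.reset(); // mandatory before the first incrementToken()
            while (ts.incrementToken()) {
                System.out.println(term.toString());
            }
            ts.end(); // records the final offset state
        }
        analyzer.close();
    }
}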

Usage

From source file:nl.uva.sne.commons.SemanticUtils.java

public static List<String> tokenize(String text, boolean stem) throws IOException, JWNLException {
    text = text.replaceAll("", "'");

    text = text.replaceAll("_", " ");
    text = text.replaceAll("[0-9]", "");
    text = text.replaceAll("[\\p{Punct}&&[^'-]]+", " ");

    text = text.replaceAll("(?:'(?:[tdsm]|[vr]e|ll))+\\b", "");
    text = text.toLowerCase();

    TokenStream tokenStream;
    if (stem) {
        tokenStream = tokenStemStream("field", new StringReader(text));
    } else {
        tokenStream = tokenStream("field", new StringReader(text));
    }

    ArrayList<String> words = new ArrayList<>();
    try {
        CharTermAttribute term = tokenStream.addAttribute(CharTermAttribute.class);
        tokenStream.reset();
        while (tokenStream.incrementToken()) {
            words.add(term.toString());
        }
        tokenStream.end();
    } finally {
        tokenStream.close();
    }
    //        Logger.getLogger(SemanticUtils.class.getName()).log(Level.INFO, "Returning {0}:", words.size() + " tokens");
    return words;
}
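
A hypothetical invocation of the method above; the input text is illustrative, and the exact output depends on the tokenStemStream/tokenStream factories configured in SemanticUtils:

List<String> stemmed = SemanticUtils.tokenize("The university's well-known researchers", true);
// yields stems such as [univers, well-known, research], depending on the stemmer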

From source file:org.aksw.palmetto.corpus.lucene.SimpleAnalyzerTest.java

License:Open Source License

public void test(boolean lowercase) throws Exception {
    SimpleAnalyzer analyzer = new SimpleAnalyzer(lowercase);
    TokenStream stream = analyzer.tokenStream("test", text);

    CharTermAttribute token;
    int count = 0;
    stream.reset();
    while (stream.incrementToken()) {
        Assert.assertTrue(count < expectedTokens.length);
        token = stream.getAttribute(CharTermAttribute.class);
        if (lowercase) {
            Assert.assertEquals(expectedTokens[count].toLowerCase(), token.toString());
        } else {
            Assert.assertEquals(expectedTokens[count], token.toString());
        }
        ++count;
    }
    Assert.assertEquals(expectedTokens.length, count);
    analyzer.close();
}

From source file:org.alfresco.repo.search.impl.lucene.analysis.MLAnalayserTest.java

License:Open Source License

/**
 * Check that the TokenStream yields exactly the tokens specified.
 * Note that order is not checked, since the map of locales will not provide a
 * predictable ordering when enumerated.
 *
 * The expected list of tokens may contain the same token more than once, and
 * the number of instances will have to match the number found in the stream.
 *
 * @param ts              TokenStream to inspect.
 * @param expectedTokens  List of expected tokens (order is not significant).
 * @throws IOException
 */
private void verifyTokenStream(TokenStream ts, List<String> expectedTokens) throws IOException {
    final int expectedCount = expectedTokens.size();
    int count = 0;

    CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);

    try {
        ts.reset();
        while (ts.incrementToken()) {
            count++;
            System.out.println("Token: " + termAtt.toString());
            if (expectedTokens.contains(termAtt.toString())) {
                // remove an instance of the term text so that it is not matched again
                expectedTokens.remove(termAtt.toString());
            } else {
                fail("Unexpected token: " + termAtt.toString());
            }
        }
        ts.end();
    } finally {
        ts.close();
    }

    assertEquals("Incorrect number of tokens generated.", expectedCount, count);
}
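
Note that verifyTokenStream consumes the expectedTokens list (each matched term is removed), so callers must pass a mutable copy. A hypothetical call, with WhitespaceAnalyzer chosen purely for illustration:

Analyzer analyzer = new WhitespaceAnalyzer();
verifyTokenStream(analyzer.tokenStream("content", "quick brown fox"),
        new ArrayList<>(Arrays.asList("quick", "brown", "fox")));
analyzer.close();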

From source file:org.alfresco.repo.search.impl.lucene.analysis.PathTokenFilterTest.java

License:Open Source License

private void tokenise(TokenStream ts, String[] tokens) throws IOException {
    int i = 0;

    CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
    TypeAttribute typeAtt = ts.addAttribute(TypeAttribute.class);

    try {
        ts.reset();
        while (ts.incrementToken()) {
            System.out.println("token: " + ts.reflectAsString(true));

            String termText = termAtt.toString();

            if (typeAtt.type().equals(PathTokenFilter.TOKEN_TYPE_PATH_ELEMENT_NAMESPACE)) {
                assert (i % 2 == 0);
                assertEquals(termText, tokens[i++]);
            } else if (typeAtt.type().equals(PathTokenFilter.TOKEN_TYPE_PATH_ELEMENT_NAMESPACE_PREFIX)) {
                assert (i % 2 == 0);
                assertEquals(termText, tokens[i++]);
            } else if (typeAtt.type().equals(PathTokenFilter.TOKEN_TYPE_PATH_ELEMENT_NAME)) {
                assert (i % 2 == 1);
                assertEquals(termText, tokens[i++]);
            }
        }
        ts.end();
    } finally {
        ts.close();
    }

    if (i != tokens.length) {
        fail("Invalid number of tokens, found " + i + " and expected " + tokens.length);
    }
}

From source file:org.alfresco.solr.AlfrescoFieldType.java

License:Open Source License

public static BytesRef analyzeMultiTerm(String field, String part, Analyzer analyzerIn) {
    if (part == null || analyzerIn == null)
        return null;

    TokenStream source = null;
    try {
        source = analyzerIn.tokenStream(field, part);
        source.reset();

        TermToBytesRefAttribute termAtt = source.getAttribute(TermToBytesRefAttribute.class);
        BytesRef bytes = termAtt.getBytesRef();

        if (!source.incrementToken())
            throw new SolrException(SolrException.ErrorCode.BAD_REQUEST,
                    "analyzer returned no terms for multiTerm term: " + part);
        if (source.incrementToken())
            throw new SolrException(SolrException.ErrorCode.BAD_REQUEST,
                    "analyzer returned too many terms for multiTerm term: " + part);

        source.end();
        return BytesRef.deepCopyOf(bytes);
    } catch (IOException e) {
        throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, "error analyzing range part: " + part, e);
    } finally {
        IOUtils.closeWhileHandlingException(source);
    }
}
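
Since analyzeMultiTerm insists on exactly one token, a hypothetical caller pairs it with a single-token analyzer; KeywordAnalyzer, which emits its whole input as one token, is used here purely for illustration:

Analyzer keyword = new KeywordAnalyzer();
BytesRef normalized = AlfrescoFieldType.analyzeMultiTerm("title", "Foo", keyword);
// normalized holds the analyzed bytes for the multi-term part "Foo"
keyword.close();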

From source file:org.alfresco.solr.query.Solr4QueryParser.java

License:Open Source License

private ArrayList<String> getTokens(IndexableField indexableField) throws IOException {
    ArrayList<String> tokens = new ArrayList<String>();

    TokenStream ts = indexableField.tokenStream(schema.getIndexAnalyzer(), null);
    CharTermAttribute termAttribute = ts.getAttribute(CharTermAttribute.class);
    ts.reset();
    while (ts.incrementToken()) {
        String token = new String(termAttribute.buffer(), 0, termAttribute.length());
        tokens.add(token);
    }
    ts.end();
    ts.close();

    return tokens;
}

From source file:org.alfresco.solr.query.Solr4QueryParser.java

License:Open Source License

@SuppressWarnings("unchecked")
protected Query getFieldQueryImpl(String field, String queryText, AnalysisMode analysisMode,
        LuceneFunction luceneFunction) throws ParseException, IOException {
    // make sure the field exists, or return a dummy query so we raise no
    // error (ACE-3231)
    SchemaField schemaField = schema.getFieldOrNull(field);
    boolean isNumeric = false;
    if (schemaField == null) {
        return new TermQuery(new Term("_dummy_", "_miss_"));
    } else {
        isNumeric = (schemaField.getType().getNumericType() != null);
        if (isNumeric) {
            // Check to see if queryText is numeric, or else parsing will fail.
            try {
                Double.valueOf(queryText);
            } catch (NumberFormatException e) {
                return new TermQuery(new Term("_dummy_", "_miss_"));
            }
        }
    }

    // Use the analyzer to get all the tokens, and then build a TermQuery,
    // PhraseQuery, or nothing based on the term count

    // TODO: Untokenised columns with functions require special handling

    if (luceneFunction != LuceneFunction.FIELD) {
        throw new UnsupportedOperationException(
                "Field queries are not supported on lucene functions (UPPER, LOWER, etc)");
    }

    // if the incoming string already has a language identifier we strip it
    // off and add it back on again afterwards

    String localePrefix = "";

    String toTokenise = queryText;

    if (queryText.startsWith("{")) {
        int position = queryText.indexOf("}");
        if (position > 0) {
            String language = queryText.substring(0, position + 1);
            Locale locale = new Locale(queryText.substring(1, position));
            String token = queryText.substring(position + 1);
            boolean found = false;
            for (Locale current : Locale.getAvailableLocales()) {
                if (current.toString().equalsIgnoreCase(locale.toString())) {
                    found = true;
                    break;
                }
            }
            if (found) {
                localePrefix = language;
                toTokenise = token;
            } else {
                // toTokenise = token;
            }
        }
    }

    String testText = toTokenise;
    boolean requiresMLTokenDuplication = false;
    String localeString = null;
    if (isPropertyField(field) && (localePrefix.length() == 0)) {
        if ((queryText.length() > 0) && (queryText.charAt(0) == '\u0000')) {
            int position = queryText.indexOf("\u0000", 1);
            testText = queryText.substring(position + 1);
            requiresMLTokenDuplication = true;
            localeString = queryText.substring(1, position);

        }
    }

    // find the positions of any escaped * and ? and ignore them

    Set<Integer> wildcardPoistions = getWildcardPositions(testText);

    TokenStream source = null;
    ArrayList<PackedTokenAttributeImpl> list = new ArrayList<PackedTokenAttributeImpl>();
    boolean severalTokensAtSamePosition = false;
    PackedTokenAttributeImpl nextToken;
    int positionCount = 0;

    try {
        source = getAnalyzer().tokenStream(field, new StringReader(toTokenise));
        source.reset();
        while (source.incrementToken()) {
            CharTermAttribute cta = source.getAttribute(CharTermAttribute.class);
            OffsetAttribute offsetAtt = source.getAttribute(OffsetAttribute.class);
            TypeAttribute typeAtt = null;
            if (source.hasAttribute(TypeAttribute.class)) {
                typeAtt = source.getAttribute(TypeAttribute.class);
            }
            PositionIncrementAttribute posIncAtt = null;
            if (source.hasAttribute(PositionIncrementAttribute.class)) {
                posIncAtt = source.getAttribute(PositionIncrementAttribute.class);
            }
            nextToken = new PackedTokenAttributeImpl();
            nextToken.setEmpty().copyBuffer(cta.buffer(), 0, cta.length());
            nextToken.setOffset(offsetAtt.startOffset(), offsetAtt.endOffset());
            if (typeAtt != null) {
                nextToken.setType(typeAtt.type());
            }
            if (posIncAtt != null) {
                nextToken.setPositionIncrement(posIncAtt.getPositionIncrement());
            }

            list.add(nextToken);
            if (nextToken.getPositionIncrement() != 0)
                positionCount += nextToken.getPositionIncrement();
            else
                severalTokensAtSamePosition = true;
        }
    } finally {
        try {
            if (source != null) {
                source.close();
            }
        } catch (IOException e) {
            // ignore
        }
    }

    // add any alphanumeric wildcards that have been missed;
    // fixes most stop word and wildcard issues

    for (int index = 0; index < testText.length(); index++) {
        char current = testText.charAt(index);
        if (((current == '*') || (current == '?')) && wildcardPositions.contains(index)) {
            StringBuilder pre = new StringBuilder(10);
            if (index == 0) {
                // "*" and "?" at the start

                boolean found = false;
                for (int j = 0; j < list.size(); j++) {
                    PackedTokenAttributeImpl test = list.get(j);
                    if ((test.startOffset() <= 0) && (0 < test.endOffset())) {
                        found = true;
                        break;
                    }
                }
                if (!found && (list.size() == 0)) {
                    // Add new token followed by * not given by the
                    // tokeniser
                    PackedTokenAttributeImpl newToken = new PackedTokenAttributeImpl();
                    newToken.setEmpty().append("", 0, 0);
                    newToken.setType("ALPHANUM");
                    if (requiresMLTokenDuplication) {
                        Locale locale = I18NUtil.parseLocale(localeString);
                        @SuppressWarnings("resource")
                        MLTokenDuplicator duplicator = new MLTokenDuplicator(locale,
                                MLAnalysisMode.EXACT_LANGUAGE);
                        Iterator<PackedTokenAttributeImpl> it = duplicator.buildIterator(newToken);
                        if (it != null) {
                            int count = 0;
                            while (it.hasNext()) {
                                list.add(it.next());
                                count++;
                                if (count > 1) {
                                    severalTokensAtSamePosition = true;
                                }
                            }
                        }
                    }
                    // content
                    else {
                        list.add(newToken);
                    }
                }
            } else if (index > 0) {
                // Add * and ? back into any tokens from which it has been
                // removed

                boolean tokenFound = false;
                for (int j = 0; j < list.size(); j++) {
                    PackedTokenAttributeImpl test = list.get(j);
                    if ((test.startOffset() <= index) && (index < test.endOffset())) {
                        if (requiresMLTokenDuplication) {
                            String termText = test.toString();
                            int position = termText.indexOf("}");
                            String language = termText.substring(0, position + 1);
                            String token = termText.substring(position + 1);
                            if (index >= test.startOffset() + token.length()) {
                                test.setEmpty();
                                test.append(language + token + current);
                            }
                        } else {
                            if (index >= test.startOffset() + test.length()) {
                                test.setEmpty();
                                test.append(test.toString() + current);
                            }
                        }
                        tokenFound = true;
                        break;
                    }
                }

                if (!tokenFound) {
                    for (int i = index - 1; i >= 0; i--) {
                        char c = testText.charAt(i);
                        if (Character.isLetterOrDigit(c)) {
                            boolean found = false;
                            for (int j = 0; j < list.size(); j++) {
                                PackedTokenAttributeImpl test = list.get(j);
                                if ((test.startOffset() <= i) && (i < test.endOffset())) {
                                    found = true;
                                    break;
                                }
                            }
                            if (found) {
                                break;
                            } else {
                                pre.insert(0, c);
                            }
                        } else {
                            break;
                        }
                    }
                    if (pre.length() > 0) {
                        // Add new token followed by * not given by the
                        // tokeniser
                        PackedTokenAttributeImpl newToken = new PackedTokenAttributeImpl();
                        newToken.setEmpty().append(pre.toString());
                        newToken.setOffset(index - pre.length(), index);
                        newToken.setType("ALPHANUM");
                        if (requiresMLTokenDuplication) {
                            Locale locale = I18NUtil.parseLocale(localeString);
                            @SuppressWarnings("resource")
                            MLTokenDuplicator duplicator = new MLTokenDuplicator(locale,
                                    MLAnalysisMode.EXACT_LANGUAGE);
                            Iterator<PackedTokenAttributeImpl> it = duplicator.buildIterator(newToken);
                            if (it != null) {
                                int count = 0;
                                while (it.hasNext()) {
                                    list.add(it.next());
                                    count++;
                                    if (count > 1) {
                                        severalTokensAtSamePosition = true;
                                    }
                                }
                            }
                        }
                        // content
                        else {
                            list.add(newToken);
                        }
                    }
                }
            }

            StringBuilder post = new StringBuilder(10);
            if (index > 0) {
                for (int i = index + 1; i < testText.length(); i++) {
                    char c = testText.charAt(i);
                    if (Character.isLetterOrDigit(c)) {
                        boolean found = false;
                        for (int j = 0; j < list.size(); j++) {
                            PackedTokenAttributeImpl test = list.get(j);
                            if ((test.startOffset() <= i) && (i < test.endOffset())) {
                                found = true;
                                break;
                            }
                        }
                        if (found) {
                            break;
                        } else {
                            post.append(c);
                        }
                    } else {
                        break;
                    }
                }
                if (post.length() > 0) {
                    // Add new token followed by * not given by the
                    // tokeniser
                    PackedTokenAttributeImpl newToken = new PackedTokenAttributeImpl();
                    newToken.setEmpty().append(post.toString());
                    newToken.setOffset(index + 1, index + 1 + post.length());
                    newToken.setType("ALPHANUM");
                    if (requiresMLTokenDuplication) {
                        Locale locale = I18NUtil.parseLocale(localeString);
                        @SuppressWarnings("resource")
                        MLTokenDuplicator duplicator = new MLTokenDuplicator(locale,
                                MLAnalysisMode.EXACT_LANGUAGE);
                        Iterator<PackedTokenAttributeImpl> it = duplicator.buildIterator(newToken);
                        if (it != null) {
                            int count = 0;
                            while (it.hasNext()) {
                                list.add(it.next());
                                count++;
                                if (count > 1) {
                                    severalTokensAtSamePosition = true;
                                }
                            }
                        }
                    }
                    // content
                    else {
                        list.add(newToken);
                    }
                }
            }

        }
    }

    // Put in real position increments as we treat them correctly

    int currentIncrement = -1;
    for (PackedTokenAttributeImpl c : list) {
        if (currentIncrement == -1) {
            currentIncrement = c.getPositionIncrement();
        } else if (c.getPositionIncrement() > 0) {
            currentIncrement = c.getPositionIncrement();
        } else {
            c.setPositionIncrement(currentIncrement);
        }
    }

    // Fix up position increments for in phrase isolated wildcards

    boolean lastWasWild = false;
    for (int i = 0; i < list.size() - 1; i++) {
        for (int j = list.get(i).endOffset() + 1; j < list.get(i + 1).startOffset() - 1; j++) {
            if (wildcardPositions.contains(j)) {
                if (!lastWasWild) {
                    list.get(i + 1).setPositionIncrement(list.get(i + 1).getPositionIncrement() + 1);
                }
                lastWasWild = true;
            } else {
                lastWasWild = false;
            }
        }
    }

    Collections.sort(list, new Comparator<PackedTokenAttributeImpl>() {

        public int compare(PackedTokenAttributeImpl o1, PackedTokenAttributeImpl o2) {
            int dif = o1.startOffset() - o2.startOffset();
            return dif;

        }
    });

    // Combined * and ? based strings - should redo the tokeniser

    // Build tokens by position

    LinkedList<LinkedList<PackedTokenAttributeImpl>> tokensByPosition = new LinkedList<LinkedList<PackedTokenAttributeImpl>>();
    LinkedList<PackedTokenAttributeImpl> currentList = null;
    int lastStart = 0;
    for (PackedTokenAttributeImpl c : list) {
        if (c.startOffset() == lastStart) {
            if (currentList == null) {
                currentList = new LinkedList<PackedTokenAttributeImpl>();
                tokensByPosition.add(currentList);
            }
            currentList.add(c);
        } else {
            currentList = new LinkedList<PackedTokenAttributeImpl>();
            tokensByPosition.add(currentList);
            currentList.add(c);
        }
        lastStart = c.startOffset();
    }

    // Build all the token sequences and see which ones get strung together

    OrderedHashSet<LinkedList<PackedTokenAttributeImpl>> allTokenSequencesSet = new OrderedHashSet<LinkedList<PackedTokenAttributeImpl>>();
    for (LinkedList<PackedTokenAttributeImpl> tokensAtPosition : tokensByPosition) {
        OrderedHashSet<LinkedList<PackedTokenAttributeImpl>> positionalSynonymSequencesSet = new OrderedHashSet<LinkedList<PackedTokenAttributeImpl>>();

        OrderedHashSet<LinkedList<PackedTokenAttributeImpl>> newAllTokenSequencesSet = new OrderedHashSet<LinkedList<PackedTokenAttributeImpl>>();

        FOR_FIRST_TOKEN_AT_POSITION_ONLY: for (PackedTokenAttributeImpl t : tokensAtPosition) {
            PackedTokenAttributeImpl replace = new PackedTokenAttributeImpl();
            replace.setEmpty().append(t);
            replace.setOffset(t.startOffset(), t.endOffset());
            replace.setType(t.type());
            replace.setPositionIncrement(t.getPositionIncrement());

            boolean tokenFoundSequence = false;
            for (LinkedList<PackedTokenAttributeImpl> tokenSequence : allTokenSequencesSet) {
                LinkedList<PackedTokenAttributeImpl> newEntry = new LinkedList<PackedTokenAttributeImpl>();
                newEntry.addAll(tokenSequence);
                if ((newEntry.getLast().endOffset() == replace.endOffset())
                        && replace.type().equals(SynonymFilter.TYPE_SYNONYM)) {
                    if ((newEntry.getLast().startOffset() == replace.startOffset())
                            && newEntry.getLast().type().equals(SynonymFilter.TYPE_SYNONYM)) {
                        positionalSynonymSequencesSet.add(tokenSequence);
                        newEntry.add(replace);
                        tokenFoundSequence = true;
                    } else if (newEntry.getLast().type().equals(CommonGramsFilter.GRAM_TYPE)) {
                        if (newEntry.toString().endsWith(replace.toString())) {
                            // already in the gram
                            positionalSynonymSequencesSet.add(tokenSequence);
                            tokenFoundSequence = true;
                        } else {
                            // need to replace the synonym in the current
                            // gram
                            tokenFoundSequence = true;
                            StringBuffer old = new StringBuffer(newEntry.getLast().toString());
                            old.replace(replace.startOffset() - newEntry.getLast().startOffset(),
                                    replace.endOffset() - newEntry.getLast().startOffset(), replace.toString());
                            PackedTokenAttributeImpl newToken = new PackedTokenAttributeImpl();
                            newToken.setEmpty().append(old.toString());
                            newToken.setOffset(newEntry.getLast().startOffset(),
                                    newEntry.getLast().endOffset());
                            newEntry.removeLast();
                            newEntry.add(newToken);
                        }
                    }
                } else if ((newEntry.getLast().startOffset() < replace.startOffset())
                        && (newEntry.getLast().endOffset() < replace.endOffset())) {
                    if (newEntry.getLast().type().equals(SynonymFilter.TYPE_SYNONYM)
                            && replace.type().equals(SynonymFilter.TYPE_SYNONYM)) {
                        positionalSynonymSequencesSet.add(tokenSequence);
                    }
                    newEntry.add(replace);
                    tokenFoundSequence = true;
                }
                newAllTokenSequencesSet.add(newEntry);
            }
            if (false == tokenFoundSequence) {
                for (LinkedList<PackedTokenAttributeImpl> tokenSequence : newAllTokenSequencesSet) {
                    LinkedList<PackedTokenAttributeImpl> newEntry = new LinkedList<PackedTokenAttributeImpl>();
                    newEntry.addAll(tokenSequence);
                    if ((newEntry.getLast().endOffset() == replace.endOffset())
                            && replace.type().equals(SynonymFilter.TYPE_SYNONYM)) {
                        if ((newEntry.getLast().startOffset() == replace.startOffset())
                                && newEntry.getLast().type().equals(SynonymFilter.TYPE_SYNONYM)) {
                            positionalSynonymSequencesSet.add(tokenSequence);
                            newEntry.add(replace);
                            tokenFoundSequence = true;
                        } else if (newEntry.getLast().type().equals(CommonGramsFilter.GRAM_TYPE)) {
                            if (newEntry.toString().endsWith(replace.toString())) {
                                // already in the gram
                                positionalSynonymSequencesSet.add(tokenSequence);
                                tokenFoundSequence = true;
                            } else {
                                // need to replace the synonym in the
                                // current gram
                                tokenFoundSequence = true;
                                StringBuffer old = new StringBuffer(newEntry.getLast().toString());
                                old.replace(replace.startOffset() - newEntry.getLast().startOffset(),
                                        replace.endOffset() - newEntry.getLast().startOffset(),
                                        replace.toString());
                                PackedTokenAttributeImpl newToken = new PackedTokenAttributeImpl();
                                newToken.setEmpty().append(old.toString());
                                newToken.setOffset(newEntry.getLast().startOffset(),
                                        newEntry.getLast().endOffset());
                                newEntry.removeLast();
                                newEntry.add(newToken);
                                positionalSynonymSequencesSet.add(newEntry);
                            }
                        }
                    } else if ((newEntry.getLast().startOffset() < replace.startOffset())
                            && (newEntry.getLast().endOffset() < replace.endOffset())) {
                        if (newEntry.getLast().type().equals(SynonymFilter.TYPE_SYNONYM)
                                && replace.type().equals(SynonymFilter.TYPE_SYNONYM)) {
                            positionalSynonymSequencesSet.add(tokenSequence);
                            newEntry.add(replace);
                            tokenFoundSequence = true;
                        }
                    }
                }
            }
            if (false == tokenFoundSequence) {
                LinkedList<PackedTokenAttributeImpl> newEntry = new LinkedList<PackedTokenAttributeImpl>();
                newEntry.add(replace);
                newAllTokenSequencesSet.add(newEntry);
            }
            // Limit the max number of permutations we consider
            if (newAllTokenSequencesSet.size() > 64) {
                break FOR_FIRST_TOKEN_AT_POSITION_ONLY;
            }
        }
        allTokenSequencesSet = newAllTokenSequencesSet;
        allTokenSequencesSet.addAll(positionalSynonymSequencesSet);

    }

    LinkedList<LinkedList<PackedTokenAttributeImpl>> allTokenSequences = new LinkedList<LinkedList<PackedTokenAttributeImpl>>(
            allTokenSequencesSet);

    // build the unique

    LinkedList<LinkedList<PackedTokenAttributeImpl>> fixedTokenSequences = new LinkedList<LinkedList<PackedTokenAttributeImpl>>();
    for (LinkedList<PackedTokenAttributeImpl> tokenSequence : allTokenSequences) {
        LinkedList<PackedTokenAttributeImpl> fixedTokenSequence = new LinkedList<PackedTokenAttributeImpl>();
        fixedTokenSequences.add(fixedTokenSequence);
        PackedTokenAttributeImpl replace = null;
        for (PackedTokenAttributeImpl c : tokenSequence) {
            if (replace == null) {
                StringBuilder prefix = new StringBuilder();
                for (int i = c.startOffset() - 1; i >= 0; i--) {
                    char test = testText.charAt(i);
                    if (((test == '*') || (test == '?')) && wildcardPositions.contains(i)) {
                        prefix.insert(0, test);
                    } else {
                        break;
                    }
                }
                String pre = prefix.toString();
                if (requiresMLTokenDuplication) {
                    String termText = c.toString();
                    int position = termText.indexOf("}");
                    String language = termText.substring(0, position + 1);
                    String token = termText.substring(position + 1);
                    replace = new PackedTokenAttributeImpl();
                    replace.setEmpty().append(language + pre + token);
                    replace.setOffset(c.startOffset() - pre.length(), c.endOffset());
                    replace.setType(c.type());
                    replace.setPositionIncrement(c.getPositionIncrement());
                } else {
                    String termText = c.toString();
                    replace = new PackedTokenAttributeImpl();
                    replace.setEmpty().append(pre + termText);
                    replace.setOffset(c.startOffset() - pre.length(), c.endOffset());
                    replace.setType(c.type());
                    replace.setPositionIncrement(c.getPositionIncrement());
                }
            } else {
                StringBuilder prefix = new StringBuilder();
                StringBuilder postfix = new StringBuilder();
                StringBuilder builder = prefix;
                for (int i = c.startOffset() - 1; i >= replace.endOffset(); i--) {
                    char test = testText.charAt(i);
                    if (((test == '*') || (test == '?')) && wildcardPositions.contains(i)) {
                        builder.insert(0, test);
                    } else {
                        builder = postfix;
                        postfix.setLength(0);
                    }
                }
                String pre = prefix.toString();
                String post = postfix.toString();

                // Does it bridge?
                if ((pre.length() > 0) && (replace.endOffset() + pre.length()) == c.startOffset()) {
                    String termText = c.toString();
                    if (requiresMLTokenDuplication) {
                        int position = termText.indexOf("}");
                        @SuppressWarnings("unused")
                        String language = termText.substring(0, position + 1);
                        String token = termText.substring(position + 1);
                        int oldPositionIncrement = replace.getPositionIncrement();
                        String replaceTermText = replace.toString();
                        replace = new PackedTokenAttributeImpl();
                        replace.setEmpty().append(replaceTermText + pre + token);
                        replace.setOffset(replace.startOffset(), c.endOffset());
                        replace.setType(replace.type());
                        replace.setPositionIncrement(oldPositionIncrement);
                    } else {
                        int oldPositionIncrement = replace.getPositionIncrement();
                        String replaceTermText = replace.toString();
                        replace = new PackedTokenAttributeImpl();
                        replace.setEmpty().append(replaceTermText + pre + termText);
                        replace.setOffset(replace.startOffset(), c.endOffset());
                        replace.setType(replace.type());
                        replace.setPositionIncrement(oldPositionIncrement);
                    }
                } else {
                    String termText = c.toString();
                    if (requiresMLTokenDuplication) {
                        int position = termText.indexOf("}");
                        String language = termText.substring(0, position + 1);
                        String token = termText.substring(position + 1);
                        String replaceTermText = replace.toString();
                        PackedTokenAttributeImpl last = new PackedTokenAttributeImpl();
                        last.setEmpty().append(replaceTermText + post);
                        last.setOffset(replace.startOffset(), replace.endOffset() + post.length());
                        last.setType(replace.type());
                        last.setPositionIncrement(replace.getPositionIncrement());
                        fixedTokenSequence.add(last);
                        replace = new PackedTokenAttributeImpl();
                        replace.setEmpty().append(language + pre + token);
                        replace.setOffset(c.startOffset() - pre.length(), c.endOffset());
                        replace.setType(c.type());
                        replace.setPositionIncrement(c.getPositionIncrement());
                    } else {
                        String replaceTermText = replace.toString();
                        PackedTokenAttributeImpl last = new PackedTokenAttributeImpl();
                        last.setEmpty().append(replaceTermText + post);
                        last.setOffset(replace.startOffset(), replace.endOffset() + post.length());
                        last.setType(replace.type());
                        last.setPositionIncrement(replace.getPositionIncrement());
                        fixedTokenSequence.add(last);
                        replace = new PackedTokenAttributeImpl();
                        replace.setEmpty().append(pre + termText);
                        replace.setOffset(c.startOffset() - pre.length(), c.endOffset());
                        replace.setType(c.type());
                        replace.setPositionIncrement(c.getPositionIncrement());
                    }
                }
            }
        }
        // finish last
        if (replace != null) {
            StringBuilder postfix = new StringBuilder();
            if ((replace.endOffset() >= 0) && (replace.endOffset() < testText.length())) {
                for (int i = replace.endOffset(); i < testText.length(); i++) {
                    char test = testText.charAt(i);
                    if (((test == '*') || (test == '?')) && wildcardPositions.contains(i)) {
                        postfix.append(test);
                    } else {
                        break;
                    }
                }
            }
            String post = postfix.toString();
            int oldPositionIncrement = replace.getPositionIncrement();
            String replaceTermText = replace.toString();
            PackedTokenAttributeImpl terminal = new PackedTokenAttributeImpl();
            terminal.setEmpty().append(replaceTermText + post);
            terminal.setOffset(replace.startOffset(), replace.endOffset() + post.length());
            terminal.setType(replace.type());
            terminal.setPositionIncrement(oldPositionIncrement);
            fixedTokenSequence.add(terminal);
        }
    }

    // rebuild fixed list

    ArrayList<PackedTokenAttributeImpl> fixed = new ArrayList<PackedTokenAttributeImpl>();
    for (LinkedList<PackedTokenAttributeImpl> tokenSequence : fixedTokenSequences) {
        for (PackedTokenAttributeImpl token : tokenSequence) {
            fixed.add(token);
        }
    }

    // reorder by start position and increment

    Collections.sort(fixed, new Comparator<PackedTokenAttributeImpl>() {

        public int compare(PackedTokenAttributeImpl o1, PackedTokenAttributeImpl o2) {
            int dif = o1.startOffset() - o2.startOffset();
            if (dif != 0) {
                return dif;
            } else {
                return o1.getPositionIncrement() - o2.getPositionIncrement();
            }
        }
    });

    // make sure we remove any tokens we have duplicated

    @SuppressWarnings("rawtypes")
    OrderedHashSet unique = new OrderedHashSet();
    unique.addAll(fixed);
    fixed = new ArrayList<PackedTokenAttributeImpl>(unique);

    list = fixed;

    // add any missing locales back to the tokens

    if (localePrefix.length() > 0) {
        for (int j = 0; j < list.size(); j++) {
            PackedTokenAttributeImpl currentToken = list.get(j);
            String termText = currentToken.toString();
            currentToken.setEmpty();
            currentToken.append(localePrefix + termText);
        }
    }

    SchemaField sf = schema.getField(field);

    boolean isShingled = false;
    @SuppressWarnings("resource")
    TokenizerChain tokenizerChain = (sf.getType().getQueryAnalyzer() instanceof TokenizerChain)
            ? ((TokenizerChain) sf.getType().getQueryAnalyzer())
            : null;
    if (tokenizerChain != null) {
        for (TokenFilterFactory factory : tokenizerChain.getTokenFilterFactories()) {
            if (factory instanceof ShingleFilterFactory) {
                isShingled = true;
                break;
            }
        }
    }
    @SuppressWarnings("resource")
    AlfrescoAnalyzerWrapper analyzerWrapper = (sf.getType()
            .getQueryAnalyzer() instanceof AlfrescoAnalyzerWrapper)
                    ? ((AlfrescoAnalyzerWrapper) sf.getType().getQueryAnalyzer())
                    : null;
    if (analyzerWrapper != null) {
        // assume if there are no term positions it is shingled ....
        isShingled = true;
    }

    boolean forceConjunction = rerankPhase == RerankPhase.QUERY_PHASE;

    if (list.size() == 0) {
        return null;
    } else if (list.size() == 1) {
        nextToken = list.get(0);
        String termText = nextToken.toString();
        if (!isNumeric && (termText.contains("*") || termText.contains("?"))) {
            return newWildcardQuery(new Term(field, termText));
        } else {
            return newTermQuery(new Term(field, termText));
        }
    } else {
        if (severalTokensAtSamePosition) {
            if (positionCount == 1) {
                // no phrase query:
                Builder q = newBooleanQuery();
                for (int i = 0; i < list.size(); i++) {
                    Query currentQuery;
                    nextToken = list.get(i);
                    String termText = nextToken.toString();
                    if (termText.contains("*") || termText.contains("?")) {
                        currentQuery = newWildcardQuery(new Term(field, termText));
                    } else {
                        currentQuery = newTermQuery(new Term(field, termText));
                    }
                    q.add(currentQuery, BooleanClause.Occur.SHOULD);
                }
                return q.build();
            } else if (forceConjunction) {
                BooleanQuery.Builder or = new BooleanQuery.Builder();

                for (LinkedList<PackedTokenAttributeImpl> tokenSequence : fixedTokenSequences) {
                    BooleanQuery.Builder and = new BooleanQuery.Builder();
                    for (int i = 0; i < tokenSequence.size(); i++) {
                        nextToken = (PackedTokenAttributeImpl) tokenSequence.get(i);
                        String termText = nextToken.toString();

                        Term term = new Term(field, termText);
                        if ((termText != null) && (termText.contains("*") || termText.contains("?"))) {
                            org.apache.lucene.search.WildcardQuery wildQuery = new org.apache.lucene.search.WildcardQuery(
                                    term);
                            and.add(wildQuery, Occur.MUST);
                        } else {
                            TermQuery termQuery = new TermQuery(term);
                            and.add(termQuery, Occur.MUST);
                        }
                    }
                    if (and.build().clauses().size() > 0) {
                        or.add(and.build(), Occur.SHOULD);
                    }
                }
                return or.build();
            }
            // shingle
            else if (sf.omitPositions() && isShingled) {

                ArrayList<PackedTokenAttributeImpl> nonContained = getNonContained(list);
                Query currentQuery;

                BooleanQuery.Builder weakPhrase = new BooleanQuery.Builder();
                for (PackedTokenAttributeImpl shingleToken : nonContained) {
                    String termText = shingleToken.toString();
                    Term term = new Term(field, termText);

                    if ((termText != null) && (termText.contains("*") || termText.contains("?"))) {
                        currentQuery = new org.apache.lucene.search.WildcardQuery(term);
                    } else {
                        currentQuery = new TermQuery(term);
                    }
                    weakPhrase.add(currentQuery, Occur.MUST);
                }

                return weakPhrase.build();

            }
            // The word delimiter factory and other odd filters generate
            // complex token patterns. Skip token sequences whose small tokens
            // would generate too many wildcards and fall back to the larger
            // pattern: e.g. "Site1*" will not do (S ite 1*) or (Site 1*) if
            // "1*" matches too much, but (S ite1*) and (Site1*) will still be
            // OK. If we skip all of them (for just "1*" in the input) this is
            // still an issue.
            else {

                return generateSpanOrQuery(field, fixedTokenSequences);

            }
        } else {
        if (forceConjunction) {
                BooleanQuery.Builder or = new BooleanQuery.Builder();

                for (LinkedList<PackedTokenAttributeImpl> tokenSequence : fixedTokenSequences) {
                    BooleanQuery.Builder and = new BooleanQuery.Builder();
                    for (int i = 0; i < tokenSequence.size(); i++) {
                        nextToken = (PackedTokenAttributeImpl) tokenSequence.get(i);
                        String termText = nextToken.toString();

                        Term term = new Term(field, termText);
                        if ((termText != null) && (termText.contains("*") || termText.contains("?"))) {
                            org.apache.lucene.search.WildcardQuery wildQuery = new org.apache.lucene.search.WildcardQuery(
                                    term);
                            and.add(wildQuery, Occur.MUST);
                        } else {
                            TermQuery termQuery = new TermQuery(term);
                            and.add(termQuery, Occur.MUST);
                        }
                    }
                    if (and.build().clauses().size() > 0) {
                        or.add(and.build(), Occur.SHOULD);
                    }
                }
                return or.build();
            } else {
                SpanQuery spanQuery = null;
                ArrayList<SpanQuery> atSamePositionSpanOrQueryParts = new ArrayList<SpanQuery>();
                int gap = 0;
                for (int i = 0; i < list.size(); i++) {
                    nextToken = list.get(i);
                    String termText = nextToken.toString();
                    Term term = new Term(field, termText);
                    if (getEnablePositionIncrements()) {
                        SpanQuery nextSpanQuery;
                        if ((termText != null) && (termText.contains("*") || termText.contains("?"))) {
                            org.apache.lucene.search.WildcardQuery wildQuery = new org.apache.lucene.search.WildcardQuery(
                                    term);
                            SpanMultiTermQueryWrapper<org.apache.lucene.search.WildcardQuery> wrapper = new SpanMultiTermQueryWrapper<org.apache.lucene.search.WildcardQuery>(
                                    wildQuery);
                            wrapper.setRewriteMethod(
                                    new TopTermsSpanBooleanQueryRewrite(topTermSpanRewriteLimit));
                            nextSpanQuery = wrapper;
                        } else {
                            nextSpanQuery = new SpanTermQuery(term);
                        }
                        if (gap == 0) {
                            atSamePositionSpanOrQueryParts.add(nextSpanQuery);
                        } else {
                            if (atSamePositionSpanOrQueryParts.size() == 0) {
                                if (spanQuery == null) {
                                    spanQuery = nextSpanQuery;
                                } else {
                                    spanQuery = new SpanNearQuery(new SpanQuery[] { spanQuery, nextSpanQuery },
                                            (gap - 1) + internalSlop, internalSlop < 2);
                                }
                                atSamePositionSpanOrQueryParts = new ArrayList<SpanQuery>();
                            } else if (atSamePositionSpanOrQueryParts.size() == 1) {
                                if (spanQuery == null) {
                                    spanQuery = atSamePositionSpanOrQueryParts.get(0);
                                } else {
                                    spanQuery = new SpanNearQuery(
                                            new SpanQuery[] { spanQuery,
                                                    atSamePositionSpanOrQueryParts.get(0) },
                                            (gap - 1) + internalSlop, internalSlop < 2);
                                }
                                atSamePositionSpanOrQueryParts = new ArrayList<SpanQuery>();
                                atSamePositionSpanOrQueryParts.add(nextSpanQuery);
                            } else {
                                if (spanQuery == null) {
                                    spanQuery = new SpanOrQuery(
                                            atSamePositionSpanOrQueryParts.toArray(new SpanQuery[] {}));
                                } else {
                                    spanQuery = new SpanNearQuery(
                                            new SpanQuery[] { spanQuery,
                                                    new SpanOrQuery(atSamePositionSpanOrQueryParts
                                                            .toArray(new SpanQuery[] {})) },
                                            (gap - 1) + internalSlop, internalSlop < 2);
                                }
                                atSamePositionSpanOrQueryParts = new ArrayList<SpanQuery>();
                                atSamePositionSpanOrQueryParts.add(nextSpanQuery);
                            }
                        }
                        gap = nextToken.getPositionIncrement();
                    } else {
                        SpanQuery nextSpanQuery;
                        if ((termText != null) && (termText.contains("*") || termText.contains("?"))) {
                            org.apache.lucene.search.WildcardQuery wildQuery = new org.apache.lucene.search.WildcardQuery(
                                    term);
                            SpanMultiTermQueryWrapper<org.apache.lucene.search.WildcardQuery> wrapper = new SpanMultiTermQueryWrapper<org.apache.lucene.search.WildcardQuery>(
                                    wildQuery);
                            wrapper.setRewriteMethod(
                                    new TopTermsSpanBooleanQueryRewrite(topTermSpanRewriteLimit));
                            nextSpanQuery = wrapper;
                        } else {
                            nextSpanQuery = new SpanTermQuery(term);
                        }
                        if (spanQuery == null) {
                            spanQuery = new SpanOrQuery(nextSpanQuery);
                        } else {
                            spanQuery = new SpanOrQuery(spanQuery, nextSpanQuery);
                        }
                    }
                }
                if (atSamePositionSpanOrQueryParts.size() == 0) {
                    return spanQuery;
                } else if (atSamePositionSpanOrQueryParts.size() == 1) {
                    if (spanQuery == null) {
                        spanQuery = atSamePositionSpanOrQueryParts.get(0);
                    } else {
                        spanQuery = new SpanNearQuery(
                                new SpanQuery[] { spanQuery, atSamePositionSpanOrQueryParts.get(0) },
                                (gap - 1) + internalSlop, internalSlop < 2);
                    }
                    return spanQuery;
                } else {
                    if (spanQuery == null) {
                        spanQuery = new SpanOrQuery(atSamePositionSpanOrQueryParts.toArray(new SpanQuery[] {}));
                    } else {
                        spanQuery = new SpanNearQuery(
                                new SpanQuery[] { spanQuery,
                                        new SpanOrQuery(
                                                atSamePositionSpanOrQueryParts.toArray(new SpanQuery[] {})) },
                                (gap - 1) + internalSlop, internalSlop < 2);
                    }
                    return spanQuery;
                }
            }
        }
    }
}

From source file:org.alfresco.solr.query.Solr4QueryParser.java

License:Open Source License

private String getFirstTokenForRange(String string, FieldInstance field) throws IOException {
    PackedTokenAttributeImpl nextToken;
    TokenStream source = null;

    try {
        source = getAnalyzer().tokenStream(field.getField(), new StringReader(string));
        source.reset();
        while (source.incrementToken()) {
            CharTermAttribute cta = source.getAttribute(CharTermAttribute.class);
            OffsetAttribute offsetAtt = source.getAttribute(OffsetAttribute.class);
            TypeAttribute typeAtt = null;
            if (source.hasAttribute(TypeAttribute.class)) {
                typeAtt = source.getAttribute(TypeAttribute.class);
            }
            PositionIncrementAttribute posIncAtt = null;
            if (source.hasAttribute(PositionIncrementAttribute.class)) {
                posIncAtt = source.getAttribute(PositionIncrementAttribute.class);
            }
            nextToken = new PackedTokenAttributeImpl();
            nextToken.setEmpty().copyBuffer(cta.buffer(), 0, cta.length());
            nextToken.setOffset(offsetAtt.startOffset(), offsetAtt.endOffset());
            if (typeAtt != null) {
                nextToken.setType(typeAtt.type());
            }
            if (posIncAtt != null) {
                nextToken.setPositionIncrement(posIncAtt.getPositionIncrement());
            }

            return nextToken.toString();
        }
    } finally {
        try {
            if (source != null) {
                source.close();
            }
        } catch (IOException e) {
            // ignore
        }
    }
    return null;
}

From source file:org.alfresco.solr.query.Solr4QueryParser.java

License:Open Source License

/**
 * @param first
 * @param field
 * @return SpanOrQuery
 * @throws IOException
 */
private SpanQuery buildSpanOrQuery(String first, FieldInstance field) throws IOException {
    ArrayList<SpanQuery> spanOrQueryParts = new ArrayList<SpanQuery>();

    PackedTokenAttributeImpl nextToken;
    TokenStream source = null;

    try {
        source = getAnalyzer().tokenStream(field.getField(), new StringReader(first));
        source.reset();
        while (source.incrementToken()) {
            CharTermAttribute cta = source.getAttribute(CharTermAttribute.class);
            OffsetAttribute offsetAtt = source.getAttribute(OffsetAttribute.class);
            TypeAttribute typeAtt = null;
            if (source.hasAttribute(TypeAttribute.class)) {
                typeAtt = source.getAttribute(TypeAttribute.class);
            }
            PositionIncrementAttribute posIncAtt = null;
            if (source.hasAttribute(PositionIncrementAttribute.class)) {
                posIncAtt = source.getAttribute(PositionIncrementAttribute.class);
            }
            nextToken = new PackedTokenAttributeImpl();
            nextToken.setEmpty().copyBuffer(cta.buffer(), 0, cta.length());
            nextToken.setOffset(offsetAtt.startOffset(), offsetAtt.endOffset());
            if (typeAtt != null) {
                nextToken.setType(typeAtt.type());
            }
            if (posIncAtt != null) {
                nextToken.setPositionIncrement(posIncAtt.getPositionIncrement());
            }

            SpanQuery termQuery = new SpanTermQuery(new Term(field.getField(), nextToken.toString()));
            spanOrQueryParts.add(termQuery);
        }
    } finally {
        try {
            if (source != null) {
                source.close();
            }
        } catch (IOException e) {
            // ignore
        }
    }

    if (spanOrQueryParts.size() == 1) {
        return spanOrQueryParts.get(0);
    } else {
        return new SpanOrQuery(spanOrQueryParts.toArray(new SpanQuery[] {}));
    }
}
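
Stripped of the attribute copying, the pattern above reduces to collecting one SpanTermQuery per analyzed token and OR-ing the results. A minimal sketch under assumed names (field "content", an analyzer in scope):

List<SpanQuery> parts = new ArrayList<>();
try (TokenStream ts = analyzer.tokenStream("content", "fox foxes")) {
    CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
    ts.reset();
    while (ts.incrementToken()) {
        parts.add(new SpanTermQuery(new Term("content", term.toString())));
    }
    ts.end();
}
SpanQuery spanOr = (parts.size() == 1) ? parts.get(0)
        : new SpanOrQuery(parts.toArray(new SpanQuery[0]));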

From source file:org.alfresco.solr.SolrInformationServer.java

License:Open Source License

private void addContentPropertyToDocUsingAlfrescoRepository(SolrInputDocument doc, QName propertyQName,
        long dbId, String locale) throws AuthenticationException, IOException {
    long start = System.nanoTime();

    // Expensive call to be done with ContentTracker
    GetTextContentResponse response = repositoryClient.getTextContent(dbId, propertyQName, null);

    addContentPropertyMetadata(doc, propertyQName, AlfrescoSolrDataModel.ContentFieldType.TRANSFORMATION_STATUS,
            response);
    addContentPropertyMetadata(doc, propertyQName,
            AlfrescoSolrDataModel.ContentFieldType.TRANSFORMATION_EXCEPTION, response);
    addContentPropertyMetadata(doc, propertyQName, AlfrescoSolrDataModel.ContentFieldType.TRANSFORMATION_TIME,
            response);

    InputStream ris = response.getContent();
    String textContent = "";
    try {
        if (ris != null) {
            // Get and copy content
            byte[] bytes = FileCopyUtils.copyToByteArray(new BoundedInputStream(ris, contentStreamLimit));
            textContent = new String(bytes, StandardCharsets.UTF_8);
        }
    } finally {
        // release the response only when the content has been read
        response.release();
    }

    if (minHash && textContent.length() > 0) {
        Analyzer analyzer = core.getLatestSchema().getFieldType("min_hash").getIndexAnalyzer();
        TokenStream ts = analyzer.tokenStream("min_hash", textContent);
        CharTermAttribute termAttribute = ts.getAttribute(CharTermAttribute.class);
        ts.reset();
        while (ts.incrementToken()) {
            StringBuilder tokenBuff = new StringBuilder();
            char[] buff = termAttribute.buffer();

            for (int i = 0; i < termAttribute.length(); i++) {
                tokenBuff.append(Integer.toHexString(buff[i]));
            }
            doc.addField(FINGERPRINT_FIELD, tokenBuff.toString());

        }
        ts.end();
        ts.close();
    }

    long end = System.nanoTime();
    this.getTrackerStats().addDocTransformationTime(end - start);

    StringBuilder builder = new StringBuilder(textContent.length() + 16);
    builder.append("\u0000").append(locale).append("\u0000");
    builder.append(textContent);
    String localisedText = builder.toString();

    for (FieldInstance field : AlfrescoSolrDataModel.getInstance()
            .getIndexedFieldNamesForProperty(propertyQName).getFields()) {
        doc.removeField(field.getField());
        if (field.isLocalised()) {
            doc.addField(field.getField(), localisedText);
        } else {
            doc.addField(field.getField(), textContent);
        }
        addFieldIfNotSet(doc, field);
    }
}
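
The fingerprint loop above hex-encodes each char of the min_hash token before storing it in FINGERPRINT_FIELD. The same encoding in isolation, as a small helper (the name is illustrative):

// Hex-encode a term's chars exactly as the fingerprint loop does.
static String toHexFingerprint(CharTermAttribute termAttribute) {
    StringBuilder tokenBuff = new StringBuilder();
    char[] buff = termAttribute.buffer();
    for (int i = 0; i < termAttribute.length(); i++) {
        tokenBuff.append(Integer.toHexString(buff[i]));
    }
    return tokenBuff.toString();
}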