Example usage for org.apache.lucene.analysis TokenStream close

List of usage examples for org.apache.lucene.analysis TokenStream close

Introduction

In this page you can find the example usage for org.apache.lucene.analysis TokenStream close.

Prototype

@Override
public void close() throws IOException 

Source Link

Document

Releases resources associated with this stream.

Usage

From source file:nl.inl.blacklab.filter.TestTranscribeGermanAccentsSynonymFilter.java

License:Apache License

@Test
public void testRetrieve() throws IOException {
    TokenStream ts = new StubTokenStream(new String[] { "Kln", "Berlin" });
    try {//w w  w  .  jav a  2s . co  m
        ts = new TranscribeGermanAccentsSynonymFilter(ts);
        CharTermAttribute ta = ts.addAttribute(CharTermAttribute.class);
        Assert.assertTrue(ts.incrementToken());
        Assert.assertEquals("Kln", new String(ta.buffer(), 0, ta.length()));
        Assert.assertTrue(ts.incrementToken());
        Assert.assertEquals("Koeln", new String(ta.buffer(), 0, ta.length()));
        Assert.assertTrue(ts.incrementToken());
        Assert.assertEquals("Berlin", new String(ta.buffer(), 0, ta.length()));
        Assert.assertFalse(ts.incrementToken());
    } finally {
        ts.close();
    }
}

From source file:nl.inl.blacklab.filter.TranscribeGermanAccentsFilter.java

License:Apache License

/**
 * Test program// w  w w.j av a  2s  .c  om
 * @param args
 * @throws IOException
 */
public static void main(String[] args) throws IOException {
    TokenStream ts = new WhitespaceTokenizer(Version.LUCENE_42,
            new StringReader("Aachen Dsseldorf Kln Berlin sterreich"));
    try {
        ts = new TranscribeGermanAccentsFilter(ts);

        CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
        while (ts.incrementToken()) {
            System.out.println(new String(term.buffer(), 0, term.length()));
        }
    } finally {
        ts.close();
    }
}

From source file:nl.inl.blacklab.filter.TranscribeGermanAccentsSynonymFilter.java

License:Apache License

public static void main(String[] args) throws IOException {
    TokenStream ts = new WhitespaceTokenizer(Version.LUCENE_42,
            new StringReader("Aachen Dsseldorf Kln Berlin sterreich"));
    try {//from   www  .j av  a  2s .c  o  m
        ts = new TranscribeGermanAccentsSynonymFilter(ts);
        ts.reset();
        ts = new RemoveAllAccentsFilter(ts);
        ts.reset();

        CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
        while (ts.incrementToken()) {
            System.out.println(new String(term.buffer(), 0, term.length()));
        }
    } finally {
        ts.close();
    }
}

From source file:nl.inl.blacklab.index.complex.TokenStreamFromList.java

License:Apache License

public static void main(String[] args) throws IOException {
    TokenStream s = new TokenStreamFromList(Arrays.asList("a", "b", "c"), Arrays.asList(1, 1, 1));
    try {//www  .  j  av  a2  s  . c  o  m
        CharTermAttribute term = s.addAttribute(CharTermAttribute.class);
        s.incrementToken();
        System.out.println(new String(term.buffer(), 0, term.length()));
        s.incrementToken();
        System.out.println(new String(term.buffer(), 0, term.length()));
        s.incrementToken();
        System.out.println(new String(term.buffer(), 0, term.length()));
        System.out.println(s.incrementToken());
    } finally {
        s.close();
    }
}

From source file:nl.uva.sne.commons.SemanticUtils.java

public static List<String> tokenize(String text, boolean stem) throws IOException, JWNLException {
    text = text.replaceAll("", "'");

    text = text.replaceAll("_", " ");
    text = text.replaceAll("[0-9]", "");
    text = text.replaceAll("[\\p{Punct}&&[^'-]]+", " ");

    text = text.replaceAll("(?:'(?:[tdsm]|[vr]e|ll))+\\b", "");
    text = text.toLowerCase();/*  www .j a  v  a  2  s.c  om*/

    TokenStream tokenStream;
    if (stem) {
        tokenStream = tokenStemStream("field", new StringReader(text));
    } else {
        tokenStream = tokenStream("field", new StringReader(text));
    }

    ArrayList<String> words = new ArrayList<>();
    try {
        CharTermAttribute term = tokenStream.addAttribute(CharTermAttribute.class);
        tokenStream.reset();
        while (tokenStream.incrementToken()) {
            words.add(term.toString());
        }
        tokenStream.end();
    } finally {
        tokenStream.close();
    }
    //        Logger.getLogger(SemanticUtils.class.getName()).log(Level.INFO, "Returning {0}:", words.size() + " tokens");
    return words;
}

From source file:org.alfresco.repo.search.impl.lucene.AbstractLuceneQueryParser.java

License:Open Source License

@SuppressWarnings("unchecked")
protected Query getFieldQueryImpl(String field, String queryText, AnalysisMode analysisMode,
        LuceneFunction luceneFunction) throws ParseException {
    // Use the analyzer to get all the tokens, and then build a TermQuery,
    // PhraseQuery, or noth

    // TODO: Untokenised columns with functions require special handling

    if (luceneFunction != LuceneFunction.FIELD) {
        throw new UnsupportedOperationException(
                "Field queries are not supported on lucene functions (UPPER, LOWER, etc)");
    }/*from w  ww  . j a  v  a  2s . com*/

    // if the incoming string already has a language identifier we strip it iff and addit back on again

    String localePrefix = "";

    String toTokenise = queryText;

    if (queryText.startsWith("{")) {
        int position = queryText.indexOf("}");
        String language = queryText.substring(0, position + 1);
        Locale locale = new Locale(queryText.substring(1, position));
        String token = queryText.substring(position + 1);
        boolean found = false;
        if (!locale.toString().isEmpty()) {
            for (Locale current : Locale.getAvailableLocales()) {
                if (current.toString().equalsIgnoreCase(locale.toString())) {
                    found = true;
                    break;
                }
            }
        }
        if (found) {
            localePrefix = language;
            toTokenise = token;
        } else {
            toTokenise = token;
        }
    }

    String testText = toTokenise;
    boolean requiresMLTokenDuplication = false;
    String localeString = null;
    if (field.startsWith(PROPERTY_FIELD_PREFIX) && (localePrefix.length() == 0)) {
        if ((queryText.length() > 0) && (queryText.charAt(0) == '\u0000')) {
            int position = queryText.indexOf("\u0000", 1);
            testText = queryText.substring(position + 1);
            requiresMLTokenDuplication = true;
            localeString = queryText.substring(1, position);
        }
    }

    // find the positions of any escaped * and ? and ignore them

    Set<Integer> wildcardPoistions = getWildcardPositions(testText);

    TokenStream source;
    if ((localePrefix.length() == 0) || (wildcardPoistions.size() > 0)
            || (analysisMode == AnalysisMode.IDENTIFIER)) {
        source = getAnalyzer().tokenStream(field, new StringReader(toTokenise), analysisMode);
    } else {
        source = getAnalyzer().tokenStream(field, new StringReader(
                "\u0000" + localePrefix.substring(1, localePrefix.length() - 1) + "\u0000" + toTokenise),
                analysisMode);
        localePrefix = "";
    }

    ArrayList<org.apache.lucene.analysis.Token> list = new ArrayList<org.apache.lucene.analysis.Token>();
    org.apache.lucene.analysis.Token reusableToken = new org.apache.lucene.analysis.Token();
    org.apache.lucene.analysis.Token nextToken;
    int positionCount = 0;
    boolean severalTokensAtSamePosition = false;

    while (true) {
        try {
            nextToken = source.next(reusableToken);
        } catch (IOException e) {
            nextToken = null;
        }
        if (nextToken == null)
            break;
        list.add((org.apache.lucene.analysis.Token) nextToken.clone());
        if (nextToken.getPositionIncrement() != 0)
            positionCount += nextToken.getPositionIncrement();
        else
            severalTokensAtSamePosition = true;
    }
    try {
        source.close();
    } catch (IOException e) {
        // ignore
    }

    // add any alpha numeric wildcards that have been missed
    // Fixes most stop word and wild card issues

    for (int index = 0; index < testText.length(); index++) {
        char current = testText.charAt(index);
        if (((current == '*') || (current == '?')) && wildcardPoistions.contains(index)) {
            StringBuilder pre = new StringBuilder(10);
            if (index == 0) {
                // "*" and "?" at the start

                boolean found = false;
                for (int j = 0; j < list.size(); j++) {
                    org.apache.lucene.analysis.Token test = list.get(j);
                    if ((test.startOffset() <= 0) && (0 < test.endOffset())) {
                        found = true;
                        break;
                    }
                }
                if (!found && (testText.length() == 1)) {
                    // Add new token followed by * not given by the tokeniser
                    org.apache.lucene.analysis.Token newToken = new org.apache.lucene.analysis.Token(0, 0);
                    newToken.setTermBuffer("");
                    newToken.setType("ALPHANUM");
                    if (requiresMLTokenDuplication) {
                        Locale locale = I18NUtil.parseLocale(localeString);
                        MLAnalysisMode mlAnalysisMode = searchParameters.getMlAnalaysisMode() == null
                                ? defaultSearchMLAnalysisMode
                                : searchParameters.getMlAnalaysisMode();
                        MLTokenDuplicator duplicator = new MLTokenDuplicator(locale, mlAnalysisMode);
                        Iterator<org.apache.lucene.analysis.Token> it = duplicator.buildIterator(newToken);
                        if (it != null) {
                            int count = 0;
                            while (it.hasNext()) {
                                list.add(it.next());
                                count++;
                                if (count > 1) {
                                    severalTokensAtSamePosition = true;
                                }
                            }
                        }
                    }
                    // content
                    else {
                        list.add(newToken);
                    }
                }
            } else if (index > 0) {
                // Add * and ? back into any tokens from which it has been removed

                boolean tokenFound = false;
                for (int j = 0; j < list.size(); j++) {
                    org.apache.lucene.analysis.Token test = list.get(j);
                    if ((test.startOffset() <= index) && (index < test.endOffset())) {
                        if (requiresMLTokenDuplication) {
                            String termText = new String(test.termBuffer(), 0, test.termLength());
                            int position = termText.indexOf("}");
                            String language = termText.substring(0, position + 1);
                            String token = termText.substring(position + 1);
                            if (index >= test.startOffset() + token.length()) {
                                test.setTermBuffer(language + token + current);
                            }
                        } else {
                            if (index >= test.startOffset() + test.termLength()) {
                                test.setTermBuffer(test.term() + current);
                            }
                        }
                        tokenFound = true;
                        break;
                    }
                }

                if (!tokenFound) {
                    for (int i = index - 1; i >= 0; i--) {
                        char c = testText.charAt(i);
                        if (Character.isLetterOrDigit(c)) {
                            boolean found = false;
                            for (int j = 0; j < list.size(); j++) {
                                org.apache.lucene.analysis.Token test = list.get(j);
                                if ((test.startOffset() <= i) && (i < test.endOffset())) {
                                    found = true;
                                    break;
                                }
                            }
                            if (found) {
                                break;
                            } else {
                                pre.insert(0, c);
                            }
                        } else {
                            break;
                        }
                    }
                    if (pre.length() > 0) {
                        // Add new token followed by * not given by the tokeniser
                        org.apache.lucene.analysis.Token newToken = new org.apache.lucene.analysis.Token(
                                index - pre.length(), index);
                        newToken.setTermBuffer(pre.toString());
                        newToken.setType("ALPHANUM");
                        if (requiresMLTokenDuplication) {
                            Locale locale = I18NUtil.parseLocale(localeString);
                            MLAnalysisMode mlAnalysisMode = searchParameters.getMlAnalaysisMode() == null
                                    ? defaultSearchMLAnalysisMode
                                    : searchParameters.getMlAnalaysisMode();
                            MLTokenDuplicator duplicator = new MLTokenDuplicator(locale, mlAnalysisMode);
                            Iterator<org.apache.lucene.analysis.Token> it = duplicator.buildIterator(newToken);
                            if (it != null) {
                                int count = 0;
                                while (it.hasNext()) {
                                    list.add(it.next());
                                    count++;
                                    if (count > 1) {
                                        severalTokensAtSamePosition = true;
                                    }
                                }
                            }
                        }
                        // content
                        else {
                            list.add(newToken);
                        }
                    }
                }
            }

            StringBuilder post = new StringBuilder(10);
            if (index > 0) {
                for (int i = index + 1; i < testText.length(); i++) {
                    char c = testText.charAt(i);
                    if (Character.isLetterOrDigit(c)) {
                        boolean found = false;
                        for (int j = 0; j < list.size(); j++) {
                            org.apache.lucene.analysis.Token test = list.get(j);
                            if ((test.startOffset() <= i) && (i < test.endOffset())) {
                                found = true;
                                break;
                            }
                        }
                        if (found) {
                            break;
                        } else {
                            post.append(c);
                        }
                    } else {
                        break;
                    }
                }
                if (post.length() > 0) {
                    // Add new token followed by * not given by the tokeniser
                    org.apache.lucene.analysis.Token newToken = new org.apache.lucene.analysis.Token(index + 1,
                            index + 1 + post.length());
                    newToken.setTermBuffer(post.toString());
                    newToken.setType("ALPHANUM");
                    if (requiresMLTokenDuplication) {
                        Locale locale = I18NUtil.parseLocale(localeString);
                        MLAnalysisMode mlAnalysisMode = searchParameters.getMlAnalaysisMode() == null
                                ? defaultSearchMLAnalysisMode
                                : searchParameters.getMlAnalaysisMode();
                        MLTokenDuplicator duplicator = new MLTokenDuplicator(locale, mlAnalysisMode);
                        Iterator<org.apache.lucene.analysis.Token> it = duplicator.buildIterator(newToken);
                        if (it != null) {
                            int count = 0;
                            while (it.hasNext()) {
                                list.add(it.next());
                                count++;
                                if (count > 1) {
                                    severalTokensAtSamePosition = true;
                                }
                            }
                        }
                    }
                    // content
                    else {
                        list.add(newToken);
                    }
                }
            }

        }
    }

    Collections.sort(list, new Comparator<org.apache.lucene.analysis.Token>() {

        public int compare(Token o1, Token o2) {
            int dif = o1.startOffset() - o2.startOffset();
            if (dif != 0) {
                return dif;
            } else {
                return o2.getPositionIncrement() - o1.getPositionIncrement();
            }
        }
    });

    // Combined * and ? based strings - should redo the tokeniser

    // Build tokens by position

    LinkedList<LinkedList<org.apache.lucene.analysis.Token>> tokensByPosition = new LinkedList<LinkedList<org.apache.lucene.analysis.Token>>();
    LinkedList<org.apache.lucene.analysis.Token> currentList = null;
    for (org.apache.lucene.analysis.Token c : list) {
        if (c.getPositionIncrement() == 0) {
            if (currentList == null) {
                currentList = new LinkedList<org.apache.lucene.analysis.Token>();
                tokensByPosition.add(currentList);
            }
            currentList.add(c);
        } else {
            currentList = new LinkedList<org.apache.lucene.analysis.Token>();
            tokensByPosition.add(currentList);
            currentList.add(c);
        }
    }

    // Build all the token sequences and see which ones get strung together

    LinkedList<LinkedList<org.apache.lucene.analysis.Token>> allTokenSequences = new LinkedList<LinkedList<org.apache.lucene.analysis.Token>>();
    for (LinkedList<org.apache.lucene.analysis.Token> tokensAtPosition : tokensByPosition) {
        if (allTokenSequences.size() == 0) {
            for (org.apache.lucene.analysis.Token t : tokensAtPosition) {
                LinkedList<org.apache.lucene.analysis.Token> newEntry = new LinkedList<org.apache.lucene.analysis.Token>();
                newEntry.add(t);
                allTokenSequences.add(newEntry);
            }
        } else {
            LinkedList<LinkedList<org.apache.lucene.analysis.Token>> newAllTokeSequences = new LinkedList<LinkedList<org.apache.lucene.analysis.Token>>();

            FOR_FIRST_TOKEN_AT_POSITION_ONLY: for (org.apache.lucene.analysis.Token t : tokensAtPosition) {
                boolean tokenFoundSequence = false;
                for (LinkedList<org.apache.lucene.analysis.Token> tokenSequence : allTokenSequences) {
                    LinkedList<org.apache.lucene.analysis.Token> newEntry = new LinkedList<org.apache.lucene.analysis.Token>();
                    newEntry.addAll(tokenSequence);
                    if (newEntry.getLast().endOffset() <= t.startOffset()) {
                        newEntry.add(t);
                        tokenFoundSequence = true;
                    }
                    newAllTokeSequences.add(newEntry);
                }
                if (false == tokenFoundSequence) {
                    LinkedList<org.apache.lucene.analysis.Token> newEntry = new LinkedList<org.apache.lucene.analysis.Token>();
                    newEntry.add(t);
                    newAllTokeSequences.add(newEntry);
                }
                // Limit the max number of permutations we consider
                if (newAllTokeSequences.size() > 64) {
                    break FOR_FIRST_TOKEN_AT_POSITION_ONLY;
                }
            }
            allTokenSequences = newAllTokeSequences;
        }
    }

    // build the uniquie

    LinkedList<LinkedList<org.apache.lucene.analysis.Token>> fixedTokenSequences = new LinkedList<LinkedList<org.apache.lucene.analysis.Token>>();
    for (LinkedList<org.apache.lucene.analysis.Token> tokenSequence : allTokenSequences) {
        LinkedList<org.apache.lucene.analysis.Token> fixedTokenSequence = new LinkedList<org.apache.lucene.analysis.Token>();
        fixedTokenSequences.add(fixedTokenSequence);
        org.apache.lucene.analysis.Token replace = null;
        for (org.apache.lucene.analysis.Token c : tokenSequence) {
            if (replace == null) {
                StringBuilder prefix = new StringBuilder();
                for (int i = c.startOffset() - 1; i >= 0; i--) {
                    char test = testText.charAt(i);
                    if (((test == '*') || (test == '?')) && wildcardPoistions.contains(i)) {
                        prefix.insert(0, test);
                    } else {
                        break;
                    }
                }
                String pre = prefix.toString();
                if (requiresMLTokenDuplication) {
                    String termText = new String(c.termBuffer(), 0, c.termLength());
                    int position = termText.indexOf("}");
                    String language = termText.substring(0, position + 1);
                    String token = termText.substring(position + 1);
                    replace = new org.apache.lucene.analysis.Token(c.startOffset() - pre.length(),
                            c.endOffset());
                    replace.setTermBuffer(language + pre + token);
                    replace.setType(c.type());
                    replace.setPositionIncrement(c.getPositionIncrement());
                } else {
                    String termText = new String(c.termBuffer(), 0, c.termLength());
                    replace = new org.apache.lucene.analysis.Token(c.startOffset() - pre.length(),
                            c.endOffset());
                    replace.setTermBuffer(pre + termText);
                    replace.setType(c.type());
                    replace.setPositionIncrement(c.getPositionIncrement());
                }
            } else {
                StringBuilder prefix = new StringBuilder();
                StringBuilder postfix = new StringBuilder();
                StringBuilder builder = prefix;
                for (int i = c.startOffset() - 1; i >= replace.endOffset(); i--) {
                    char test = testText.charAt(i);
                    if (((test == '*') || (test == '?')) && wildcardPoistions.contains(i)) {
                        builder.insert(0, test);
                    } else {
                        builder = postfix;
                        postfix.setLength(0);
                    }
                }
                String pre = prefix.toString();
                String post = postfix.toString();

                // Does it bridge?
                if ((pre.length() > 0) && (replace.endOffset() + pre.length()) == c.startOffset()) {
                    String termText = new String(c.termBuffer(), 0, c.termLength());
                    if (requiresMLTokenDuplication) {
                        int position = termText.indexOf("}");
                        @SuppressWarnings("unused")
                        String language = termText.substring(0, position + 1);
                        String token = termText.substring(position + 1);
                        int oldPositionIncrement = replace.getPositionIncrement();
                        String replaceTermText = new String(replace.termBuffer(), 0, replace.termLength());
                        replace = new org.apache.lucene.analysis.Token(replace.startOffset(), c.endOffset());
                        replace.setTermBuffer(replaceTermText + pre + token);
                        replace.setType(replace.type());
                        replace.setPositionIncrement(oldPositionIncrement);
                    } else {
                        int oldPositionIncrement = replace.getPositionIncrement();
                        String replaceTermText = new String(replace.termBuffer(), 0, replace.termLength());
                        replace = new org.apache.lucene.analysis.Token(replace.startOffset(), c.endOffset());
                        replace.setTermBuffer(replaceTermText + pre + termText);
                        replace.setType(replace.type());
                        replace.setPositionIncrement(oldPositionIncrement);
                    }
                } else {
                    String termText = new String(c.termBuffer(), 0, c.termLength());
                    if (requiresMLTokenDuplication) {
                        int position = termText.indexOf("}");
                        String language = termText.substring(0, position + 1);
                        String token = termText.substring(position + 1);
                        String replaceTermText = new String(replace.termBuffer(), 0, replace.termLength());
                        org.apache.lucene.analysis.Token last = new org.apache.lucene.analysis.Token(
                                replace.startOffset(), replace.endOffset() + post.length());
                        last.setTermBuffer(replaceTermText + post);
                        last.setType(replace.type());
                        last.setPositionIncrement(replace.getPositionIncrement());
                        fixedTokenSequence.add(last);
                        replace = new org.apache.lucene.analysis.Token(c.startOffset() - pre.length(),
                                c.endOffset());
                        replace.setTermBuffer(language + pre + token);
                        replace.setType(c.type());
                        replace.setPositionIncrement(c.getPositionIncrement());
                    } else {
                        String replaceTermText = new String(replace.termBuffer(), 0, replace.termLength());
                        org.apache.lucene.analysis.Token last = new org.apache.lucene.analysis.Token(
                                replace.startOffset(), replace.endOffset() + post.length());
                        last.setTermBuffer(replaceTermText + post);
                        last.setType(replace.type());
                        last.setPositionIncrement(replace.getPositionIncrement());
                        fixedTokenSequence.add(last);
                        replace = new org.apache.lucene.analysis.Token(c.startOffset() - pre.length(),
                                c.endOffset());
                        replace.setTermBuffer(pre + termText);
                        replace.setType(c.type());
                        replace.setPositionIncrement(c.getPositionIncrement());
                    }
                }
            }
        }
        // finish last
        if (replace != null) {
            StringBuilder postfix = new StringBuilder();
            if ((replace.endOffset() >= 0) && (replace.endOffset() < testText.length())) {
                for (int i = replace.endOffset(); i < testText.length(); i++) {
                    char test = testText.charAt(i);
                    if (((test == '*') || (test == '?')) && wildcardPoistions.contains(i)) {
                        postfix.append(test);
                    } else {
                        break;
                    }
                }
            }
            String post = postfix.toString();
            int oldPositionIncrement = replace.getPositionIncrement();
            String replaceTermText = new String(replace.termBuffer(), 0, replace.termLength());
            replace = new org.apache.lucene.analysis.Token(replace.startOffset(),
                    replace.endOffset() + post.length());
            replace.setTermBuffer(replaceTermText + post);
            replace.setType(replace.type());
            replace.setPositionIncrement(oldPositionIncrement);
            fixedTokenSequence.add(replace);
        }
    }

    // rebuild fixed list

    ArrayList<org.apache.lucene.analysis.Token> fixed = new ArrayList<org.apache.lucene.analysis.Token>();
    for (LinkedList<org.apache.lucene.analysis.Token> tokenSequence : fixedTokenSequences) {
        for (org.apache.lucene.analysis.Token token : tokenSequence) {
            fixed.add(token);
        }
    }

    // reorder by start position and increment

    Collections.sort(fixed, new Comparator<org.apache.lucene.analysis.Token>() {

        public int compare(Token o1, Token o2) {
            int dif = o1.startOffset() - o2.startOffset();
            if (dif != 0) {
                return dif;
            } else {
                return o1.getPositionIncrement() - o2.getPositionIncrement();
            }
        }
    });

    // make sure we remove any tokens we have duplicated

    @SuppressWarnings("rawtypes")
    OrderedHashSet unique = new OrderedHashSet();
    unique.addAll(fixed);
    fixed = new ArrayList<org.apache.lucene.analysis.Token>(unique);

    list = fixed;

    // add any missing locales back to the tokens

    if (localePrefix.length() > 0) {
        for (int j = 0; j < list.size(); j++) {
            org.apache.lucene.analysis.Token currentToken = list.get(j);
            String termText = new String(currentToken.termBuffer(), 0, currentToken.termLength());
            currentToken.setTermBuffer(localePrefix + termText);
        }
    }

    if (list.size() == 0)
        return null;
    else if (list.size() == 1) {
        nextToken = list.get(0);
        String termText = new String(nextToken.termBuffer(), 0, nextToken.termLength());
        if (termText.contains("*") || termText.contains("?")) {
            return newWildcardQuery(
                    new Term(field, getLowercaseExpandedTerms() ? termText.toLowerCase() : termText));
        } else {
            return newTermQuery(new Term(field, termText));
        }
    } else {
        if (severalTokensAtSamePosition) {
            if (positionCount == 1) {
                // no phrase query:
                BooleanQuery q = newBooleanQuery(true);
                for (int i = 0; i < list.size(); i++) {
                    Query currentQuery;
                    nextToken = list.get(i);
                    String termText = new String(nextToken.termBuffer(), 0, nextToken.termLength());
                    if (termText.contains("*") || termText.contains("?")) {
                        currentQuery = newWildcardQuery(new Term(field,
                                getLowercaseExpandedTerms() ? termText.toLowerCase() : termText));
                    } else {
                        currentQuery = newTermQuery(new Term(field, termText));
                    }
                    q.add(currentQuery, BooleanClause.Occur.SHOULD);
                }
                return q;
            }
            // Consider if we can use a multi-phrase query (e.g for synonym use rather then WordDelimiterFilterFactory)
            else if (canUseMultiPhraseQuery(fixedTokenSequences)) {
                // phrase query:
                MultiPhraseQuery mpq = newMultiPhraseQuery();
                mpq.setSlop(internalSlop);
                ArrayList<Term> multiTerms = new ArrayList<Term>();
                int position = 0;
                for (int i = 0; i < list.size(); i++) {
                    nextToken = list.get(i);
                    String termText = new String(nextToken.termBuffer(), 0, nextToken.termLength());

                    Term term = new Term(field, termText);
                    if ((termText != null) && (termText.contains("*") || termText.contains("?"))) {
                        addWildcardTerms(multiTerms, term);
                    } else {
                        multiTerms.add(term);
                    }

                    if (nextToken.getPositionIncrement() > 0 && multiTerms.size() > 0) {
                        if (getEnablePositionIncrements()) {
                            mpq.add(multiTerms.toArray(new Term[0]), position);
                        } else {
                            mpq.add(multiTerms.toArray(new Term[0]));
                        }
                        checkTermCount(field, queryText, mpq);
                        multiTerms.clear();
                    }
                    position += nextToken.getPositionIncrement();

                }
                if (getEnablePositionIncrements()) {
                    if (multiTerms.size() > 0) {
                        mpq.add(multiTerms.toArray(new Term[0]), position);
                    }
                    //                        else
                    //                        {
                    //                            mpq.add(new Term[] { new Term(field, "\u0000") }, position);
                    //                        }
                } else {
                    if (multiTerms.size() > 0) {
                        mpq.add(multiTerms.toArray(new Term[0]));
                    }
                    //                        else
                    //                        {
                    //                            mpq.add(new Term[] { new Term(field, "\u0000") });
                    //                        }
                }
                checkTermCount(field, queryText, mpq);
                return mpq;

            }
            // Word delimiter factory and other odd things generate complex token patterns
            // Smart skip token  sequences with small tokens that generate toomany wildcards
            // Fall back to the larger pattern
            // e.g Site1* will not do (S ite 1*) or (Site 1*)  if 1* matches too much (S ite1*)  and (Site1*) will still be OK 
            // If we skip all (for just 1* in the input) this is still an issue.
            else {
                boolean skippedTokens = false;
                BooleanQuery q = newBooleanQuery(true);
                TOKEN_SEQUENCE: for (LinkedList<org.apache.lucene.analysis.Token> tokenSequence : fixedTokenSequences) {
                    // phrase query:
                    MultiPhraseQuery mpq = newMultiPhraseQuery();
                    mpq.setSlop(internalSlop);
                    int position = 0;
                    for (int i = 0; i < tokenSequence.size(); i++) {
                        nextToken = (org.apache.lucene.analysis.Token) tokenSequence.get(i);
                        String termText = new String(nextToken.termBuffer(), 0, nextToken.termLength());

                        Term term = new Term(field, termText);

                        if (getEnablePositionIncrements()) {
                            if ((termText != null) && (termText.contains("*") || termText.contains("?"))) {
                                mpq.add(getMatchingTerms(field, term), position);
                            } else {
                                mpq.add(new Term[] { term }, position);
                            }
                            if (exceedsTermCount(mpq)) {
                                // We could duplicate the token sequence without the failing wildcard expansion and try again ??
                                skippedTokens = true;
                                continue TOKEN_SEQUENCE;
                            }
                            if (nextToken.getPositionIncrement() > 0) {
                                position += nextToken.getPositionIncrement();
                            } else {
                                position++;
                            }

                        } else {
                            if ((termText != null) && (termText.contains("*") || termText.contains("?"))) {
                                mpq.add(getMatchingTerms(field, term));
                            } else {
                                mpq.add(term);
                            }
                            if (exceedsTermCount(mpq)) {
                                skippedTokens = true;
                                continue TOKEN_SEQUENCE;
                            }
                        }
                    }
                    q.add(mpq, BooleanClause.Occur.SHOULD);
                }
                if (skippedTokens && (q.clauses().size() == 0)) {
                    throw new LuceneQueryParserException(
                            "Query skipped all token sequences as wildcards generated too many clauses: "
                                    + field + " " + queryText);
                }
                return q;
            }
        } else {
            MultiPhraseQuery q = new MultiPhraseQuery();
            q.setSlop(internalSlop);
            int position = 0;
            for (int i = 0; i < list.size(); i++) {
                nextToken = list.get(i);
                String termText = new String(nextToken.termBuffer(), 0, nextToken.termLength());
                Term term = new Term(field, termText);
                if (getEnablePositionIncrements()) {
                    if ((termText != null) && (termText.contains("*") || termText.contains("?"))) {
                        q.add(getMatchingTerms(field, term), position);
                    } else {
                        q.add(new Term[] { term }, position);
                    }
                    checkTermCount(field, queryText, q);
                    if (nextToken.getPositionIncrement() > 0) {
                        position += nextToken.getPositionIncrement();
                    } else {
                        position++;
                    }
                } else {
                    if ((termText != null) && (termText.contains("*") || termText.contains("?"))) {
                        q.add(getMatchingTerms(field, term));
                    } else {
                        q.add(term);
                    }
                    checkTermCount(field, queryText, q);
                }
            }
            return q;
        }
    }
}

From source file:org.alfresco.repo.search.impl.lucene.AbstractLuceneQueryParser.java

License:Open Source License

protected String getToken(String field, String value, AnalysisMode analysisMode) throws ParseException {
    TokenStream source = getAnalyzer().tokenStream(field, new StringReader(value), analysisMode);
    org.apache.lucene.analysis.Token reusableToken = new org.apache.lucene.analysis.Token();
    org.apache.lucene.analysis.Token nextToken;
    String tokenised = null;/*w w  w.jav  a 2s.co m*/

    while (true) {
        try {
            nextToken = source.next(reusableToken);
        } catch (IOException e) {
            nextToken = null;
        }
        if (nextToken == null)
            break;
        tokenised = new String(nextToken.termBuffer(), 0, nextToken.termLength());
    }
    try {
        source.close();
    } catch (IOException e) {

    }

    return tokenised;
}

From source file:org.alfresco.repo.search.impl.lucene.analysis.MLAnalayserTest.java

License:Open Source License

/**
 * Check that the TokenStream yields the exact tokens specified.
 * Note that order is not checked, since the map of locales will not provide a
 * predictable ordering when enumerated.
 * /*from ww w  .  jav a2s .c o  m*/
 * The expected list of tokens may contain the same token more than once and
 * the number of instances will have to match the number found in the stream.
 * 
 * @param ts              TokenStream to inspect.
 * @param expectedTokens  List of tokens in the order expected from the stream.
 * @throws IOException
 */
private void verifyTokenStream(TokenStream ts, List<String> expectedTokens) throws IOException {
    final int expectedCount = expectedTokens.size();
    int count = 0;

    CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);

    try {
        ts.reset();
        while (ts.incrementToken()) {
            count++;
            System.out.println("Token: " + termAtt.toString());
            if (expectedTokens.contains(termAtt.toString())) {
                // remove an instance of the term text so that it is not matched again
                expectedTokens.remove(termAtt.toString());
            } else {
                fail("Unexpected token: " + termAtt.toString());
            }
        }
        ts.end();
    } finally {
        ts.close();
    }

    assertEquals("Incorrect number of tokens generated.", expectedCount, count);
}

From source file:org.alfresco.repo.search.impl.lucene.analysis.PathTokenFilterTest.java

License:Open Source License

private void tokenise(TokenStream ts, String[] tokens) throws IOException {
    int i = 0;//  ww  w . j  a v a 2s. c o m

    CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
    TypeAttribute typeAtt = ts.addAttribute(TypeAttribute.class);

    try {
        ts.reset();
        while (ts.incrementToken()) {
            System.out.println("token: " + ts.reflectAsString(true));

            String termText = termAtt.toString();

            if (typeAtt.type().equals(PathTokenFilter.TOKEN_TYPE_PATH_ELEMENT_NAMESPACE)) {
                assert (i % 2 == 0);
                assertEquals(termText, tokens[i++]);
            } else if (typeAtt.type().equals(PathTokenFilter.TOKEN_TYPE_PATH_ELEMENT_NAMESPACE_PREFIX)) {
                assert (i % 2 == 0);
                assertEquals(termText, tokens[i++]);
            } else if (typeAtt.type().equals(PathTokenFilter.TOKEN_TYPE_PATH_ELEMENT_NAME)) {
                assert (i % 2 == 1);
                assertEquals(termText, tokens[i++]);
            }
        }
        ts.end();
    } finally {
        ts.close();
    }

    if (i != tokens.length) {
        fail("Invalid number of tokens, found " + i + " and expected " + tokens.length);
    }
}

From source file:org.alfresco.solr.query.Solr4QueryParser.java

License:Open Source License

private ArrayList<String> getTokens(IndexableField indexableField) throws IOException {
    ArrayList<String> tokens = new ArrayList<String>();

    TokenStream ts = indexableField.tokenStream(schema.getIndexAnalyzer(), null);
    CharTermAttribute termAttribute = ts.getAttribute(CharTermAttribute.class);
    ts.reset();//from w w  w  .  j  a v  a 2  s.c  om
    while (ts.incrementToken()) {
        String token = new String(termAttribute.buffer(), 0, termAttribute.length());
        tokens.add(token);
    }
    ts.end();
    ts.close();

    return tokens;
}