Example usage for org.apache.lucene.analysis TokenStream incrementToken

List of usage examples for org.apache.lucene.analysis TokenStream incrementToken

Introduction

On this page you can find example usage for org.apache.lucene.analysis TokenStream incrementToken.

Prototype

public abstract boolean incrementToken() throws IOException;

Document

Consumers (i.e., IndexWriter) use this method to advance the stream to the next token.
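
The consumer contract around incrementToken is strict: reset() must be called before the first call, incrementToken() returns false once the stream is exhausted, and end() and close() must follow. Before the full-size examples below, here is a minimal consumer sketch (assuming a recent Lucene version in which StandardAnalyzer has a no-argument constructor; the field name and sample text are arbitrary):

import java.io.IOException;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class TokenStreamConsumer {
    public static void main(String[] args) throws IOException {
        Analyzer analyzer = new StandardAnalyzer();
        // tokenStream() returns a reusable TokenStream; it must be closed.
        try (TokenStream stream = analyzer.tokenStream("content", "The quick brown fox")) {
            // Register attributes before iterating; incrementToken() updates
            // them in place rather than returning token objects.
            CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
            stream.reset(); // mandatory before the first incrementToken()
            while (stream.incrementToken()) { // false signals end of stream
                System.out.println(term.toString());
            }
            stream.end(); // records the final offset state
        }
        analyzer.close();
    }
}

The examples below follow the same pattern, but copy each token's attributes into PackedTokenAttributeImpl instances or build queries directly from them.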

Usage

From source file:org.alfresco.solr.query.Solr4QueryParser.java

License:Open Source License
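
This Alfresco query parser method drives the analyzer by hand: it loops over incrementToken to collect every token (term text, offsets, type, and position increment) into a list, patches wildcards and locale markers back into the tokens, and then builds a term, wildcard, boolean, or span query from the result.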

@SuppressWarnings("unchecked")
protected Query getFieldQueryImpl(String field, String queryText, AnalysisMode analysisMode,
        LuceneFunction luceneFunction) throws ParseException, IOException {
    // Make sure the field exists, or return a dummy query so we have no
    // error (ACE-3231).
    SchemaField schemaField = schema.getFieldOrNull(field);
    boolean isNumeric = false;
    if (schemaField == null) {
        return new TermQuery(new Term("_dummy_", "_miss_"));
    } else {
        isNumeric = (schemaField.getType().getNumericType() != null);
        if (isNumeric) {
            // Check that queryText is numeric, or else the query will fail.
            try {
                Double.valueOf(queryText);
            } catch (NumberFormatException e) {
                return new TermQuery(new Term("_dummy_", "_miss_"));
            }
        }
    }

    // Use the analyzer to get all the tokens, and then build a TermQuery,
    // PhraseQuery, or nothing based on the term count.

    // TODO: Untokenised columns with functions require special handling

    if (luceneFunction != LuceneFunction.FIELD) {
        throw new UnsupportedOperationException(
                "Field queries are not supported on lucene functions (UPPER, LOWER, etc)");
    }

    // If the incoming string already has a language identifier we strip it
    // off and add it back on again afterwards.

    String localePrefix = "";

    String toTokenise = queryText;

    if (queryText.startsWith("{")) {
        int position = queryText.indexOf("}");
        if (position > 0) {
            String language = queryText.substring(0, position + 1);
            Locale locale = new Locale(queryText.substring(1, position));
            String token = queryText.substring(position + 1);
            boolean found = false;
            for (Locale current : Locale.getAvailableLocales()) {
                if (current.toString().equalsIgnoreCase(locale.toString())) {
                    found = true;
                    break;
                }
            }
            if (found) {
                localePrefix = language;
                toTokenise = token;
            } else {
                // toTokenise = token;
            }
        }
    }

    String testText = toTokenise;
    boolean requiresMLTokenDuplication = false;
    String localeString = null;
    if (isPropertyField(field) && (localePrefix.length() == 0)) {
        if ((queryText.length() > 0) && (queryText.charAt(0) == '\u0000')) {
            int position = queryText.indexOf("\u0000", 1);
            testText = queryText.substring(position + 1);
            requiresMLTokenDuplication = true;
            localeString = queryText.substring(1, position);

        }
    }

    // find the positions of any escaped * and ? and ignore them

    Set<Integer> wildcardPositions = getWildcardPositions(testText);

    TokenStream source = null;
    ArrayList<PackedTokenAttributeImpl> list = new ArrayList<PackedTokenAttributeImpl>();
    boolean severalTokensAtSamePosition = false;
    PackedTokenAttributeImpl nextToken;
    int positionCount = 0;

    try {
        source = getAnalyzer().tokenStream(field, new StringReader(toTokenise));
        source.reset();
        while (source.incrementToken()) {
            CharTermAttribute cta = source.getAttribute(CharTermAttribute.class);
            OffsetAttribute offsetAtt = source.getAttribute(OffsetAttribute.class);
            TypeAttribute typeAtt = null;
            if (source.hasAttribute(TypeAttribute.class)) {
                typeAtt = source.getAttribute(TypeAttribute.class);
            }
            PositionIncrementAttribute posIncAtt = null;
            if (source.hasAttribute(PositionIncrementAttribute.class)) {
                posIncAtt = source.getAttribute(PositionIncrementAttribute.class);
            }
            nextToken = new PackedTokenAttributeImpl();
            nextToken.setEmpty().copyBuffer(cta.buffer(), 0, cta.length());
            nextToken.setOffset(offsetAtt.startOffset(), offsetAtt.endOffset());
            if (typeAtt != null) {
                nextToken.setType(typeAtt.type());
            }
            if (posIncAtt != null) {
                nextToken.setPositionIncrement(posIncAtt.getPositionIncrement());
            }

            list.add(nextToken);
            if (nextToken.getPositionIncrement() != 0)
                positionCount += nextToken.getPositionIncrement();
            else
                severalTokensAtSamePosition = true;
        }
    } finally {
        try {
            if (source != null) {
                source.close();
            }
        } catch (IOException e) {
            // ignore
        }
    }

    // Add any alphanumeric wildcards that have been missed.
    // Fixes most stop word and wildcard issues.

    for (int index = 0; index < testText.length(); index++) {
        char current = testText.charAt(index);
        if (((current == '*') || (current == '?')) && wildcardPositions.contains(index)) {
            StringBuilder pre = new StringBuilder(10);
            if (index == 0) {
                // "*" and "?" at the start

                boolean found = false;
                for (int j = 0; j < list.size(); j++) {
                    PackedTokenAttributeImpl test = list.get(j);
                    if ((test.startOffset() <= 0) && (0 < test.endOffset())) {
                        found = true;
                        break;
                    }
                }
                if (!found && (list.size() == 0)) {
                    // Add new token followed by * not given by the
                    // tokeniser
                    PackedTokenAttributeImpl newToken = new PackedTokenAttributeImpl();
                    newToken.setEmpty().append("", 0, 0);
                    newToken.setType("ALPHANUM");
                    if (requiresMLTokenDuplication) {
                        Locale locale = I18NUtil.parseLocale(localeString);
                        @SuppressWarnings("resource")
                        MLTokenDuplicator duplicator = new MLTokenDuplicator(locale,
                                MLAnalysisMode.EXACT_LANGUAGE);
                        Iterator<PackedTokenAttributeImpl> it = duplicator.buildIterator(newToken);
                        if (it != null) {
                            int count = 0;
                            while (it.hasNext()) {
                                list.add(it.next());
                                count++;
                                if (count > 1) {
                                    severalTokensAtSamePosition = true;
                                }
                            }
                        }
                    }
                    // content
                    else {
                        list.add(newToken);
                    }
                }
            } else if (index > 0) {
                // Add * and ? back into any tokens from which it has been
                // removed

                boolean tokenFound = false;
                for (int j = 0; j < list.size(); j++) {
                    PackedTokenAttributeImpl test = list.get(j);
                    if ((test.startOffset() <= index) && (index < test.endOffset())) {
                        if (requiresMLTokenDuplication) {
                            String termText = test.toString();
                            int position = termText.indexOf("}");
                            String language = termText.substring(0, position + 1);
                            String token = termText.substring(position + 1);
                            if (index >= test.startOffset() + token.length()) {
                                test.setEmpty();
                                test.append(language + token + current);
                            }
                        } else {
                            if (index >= test.startOffset() + test.length()) {
                                test.setEmpty();
                                test.append(test.toString() + current);
                            }
                        }
                        tokenFound = true;
                        break;
                    }
                }

                if (!tokenFound) {
                    for (int i = index - 1; i >= 0; i--) {
                        char c = testText.charAt(i);
                        if (Character.isLetterOrDigit(c)) {
                            boolean found = false;
                            for (int j = 0; j < list.size(); j++) {
                                PackedTokenAttributeImpl test = list.get(j);
                                if ((test.startOffset() <= i) && (i < test.endOffset())) {
                                    found = true;
                                    break;
                                }
                            }
                            if (found) {
                                break;
                            } else {
                                pre.insert(0, c);
                            }
                        } else {
                            break;
                        }
                    }
                    if (pre.length() > 0) {
                        // Add new token followed by * not given by the
                        // tokeniser
                        PackedTokenAttributeImpl newToken = new PackedTokenAttributeImpl();
                        newToken.setEmpty().append(pre.toString());
                        newToken.setOffset(index - pre.length(), index);
                        newToken.setType("ALPHANUM");
                        if (requiresMLTokenDuplication) {
                            Locale locale = I18NUtil.parseLocale(localeString);
                            @SuppressWarnings("resource")
                            MLTokenDuplicator duplicator = new MLTokenDuplicator(locale,
                                    MLAnalysisMode.EXACT_LANGUAGE);
                            Iterator<PackedTokenAttributeImpl> it = duplicator.buildIterator(newToken);
                            if (it != null) {
                                int count = 0;
                                while (it.hasNext()) {
                                    list.add(it.next());
                                    count++;
                                    if (count > 1) {
                                        severalTokensAtSamePosition = true;
                                    }
                                }
                            }
                        }
                        // content
                        else {
                            list.add(newToken);
                        }
                    }
                }
            }

            StringBuilder post = new StringBuilder(10);
            if (index > 0) {
                for (int i = index + 1; i < testText.length(); i++) {
                    char c = testText.charAt(i);
                    if (Character.isLetterOrDigit(c)) {
                        boolean found = false;
                        for (int j = 0; j < list.size(); j++) {
                            PackedTokenAttributeImpl test = list.get(j);
                            if ((test.startOffset() <= i) && (i < test.endOffset())) {
                                found = true;
                                break;
                            }
                        }
                        if (found) {
                            break;
                        } else {
                            post.append(c);
                        }
                    } else {
                        break;
                    }
                }
                if (post.length() > 0) {
                    // Add new token followed by * not given by the
                    // tokeniser
                    PackedTokenAttributeImpl newToken = new PackedTokenAttributeImpl();
                    newToken.setEmpty().append(post.toString());
                    newToken.setOffset(index + 1, index + 1 + post.length());
                    newToken.setType("ALPHANUM");
                    if (requiresMLTokenDuplication) {
                        Locale locale = I18NUtil.parseLocale(localeString);
                        @SuppressWarnings("resource")
                        MLTokenDuplicator duplicator = new MLTokenDuplicator(locale,
                                MLAnalysisMode.EXACT_LANGUAGE);
                        Iterator<PackedTokenAttributeImpl> it = duplicator.buildIterator(newToken);
                        if (it != null) {
                            int count = 0;
                            while (it.hasNext()) {
                                list.add(it.next());
                                count++;
                                if (count > 1) {
                                    severalTokensAtSamePosition = true;
                                }
                            }
                        }
                    }
                    // content
                    else {
                        list.add(newToken);
                    }
                }
            }

        }
    }

    // Put in real position increments as we treat them correctly

    int currentIncrement = -1;
    for (PackedTokenAttributeImpl c : list) {
        if (currentIncrement == -1) {
            currentIncrement = c.getPositionIncrement();
        } else if (c.getPositionIncrement() > 0) {
            currentIncrement = c.getPositionIncrement();
        } else {
            c.setPositionIncrement(currentIncrement);
        }
    }

    // Fix up position increments for isolated in-phrase wildcards

    boolean lastWasWild = false;
    for (int i = 0; i < list.size() - 1; i++) {
        for (int j = list.get(i).endOffset() + 1; j < list.get(i + 1).startOffset() - 1; j++) {
            if (wildcardPositions.contains(j)) {
                if (!lastWasWild) {
                    list.get(i + 1).setPositionIncrement(list.get(i + 1).getPositionIncrement() + 1);
                }
                lastWasWild = true;
            } else {
                lastWasWild = false;
            }
        }
    }

    Collections.sort(list, new Comparator<PackedTokenAttributeImpl>() {

        public int compare(PackedTokenAttributeImpl o1, PackedTokenAttributeImpl o2) {
            int dif = o1.startOffset() - o2.startOffset();
            return dif;

        }
    });

    // Combined * and ? based strings - should redo the tokeniser

    // Build tokens by position

    LinkedList<LinkedList<PackedTokenAttributeImpl>> tokensByPosition = new LinkedList<LinkedList<PackedTokenAttributeImpl>>();
    LinkedList<PackedTokenAttributeImpl> currentList = null;
    int lastStart = 0;
    for (PackedTokenAttributeImpl c : list) {
        if (c.startOffset() == lastStart) {
            if (currentList == null) {
                currentList = new LinkedList<PackedTokenAttributeImpl>();
                tokensByPosition.add(currentList);
            }
            currentList.add(c);
        } else {
            currentList = new LinkedList<PackedTokenAttributeImpl>();
            tokensByPosition.add(currentList);
            currentList.add(c);
        }
        lastStart = c.startOffset();
    }

    // Build all the token sequences and see which ones get strung together

    OrderedHashSet<LinkedList<PackedTokenAttributeImpl>> allTokenSequencesSet = new OrderedHashSet<LinkedList<PackedTokenAttributeImpl>>();
    for (LinkedList<PackedTokenAttributeImpl> tokensAtPosition : tokensByPosition) {
        OrderedHashSet<LinkedList<PackedTokenAttributeImpl>> positionalSynonymSequencesSet = new OrderedHashSet<LinkedList<PackedTokenAttributeImpl>>();

        OrderedHashSet<LinkedList<PackedTokenAttributeImpl>> newAllTokenSequencesSet = new OrderedHashSet<LinkedList<PackedTokenAttributeImpl>>();

        FOR_FIRST_TOKEN_AT_POSITION_ONLY: for (PackedTokenAttributeImpl t : tokensAtPosition) {
            PackedTokenAttributeImpl replace = new PackedTokenAttributeImpl();
            replace.setEmpty().append(t);
            replace.setOffset(t.startOffset(), t.endOffset());
            replace.setType(t.type());
            replace.setPositionIncrement(t.getPositionIncrement());

            boolean tokenFoundSequence = false;
            for (LinkedList<PackedTokenAttributeImpl> tokenSequence : allTokenSequencesSet) {
                LinkedList<PackedTokenAttributeImpl> newEntry = new LinkedList<PackedTokenAttributeImpl>();
                newEntry.addAll(tokenSequence);
                if ((newEntry.getLast().endOffset() == replace.endOffset())
                        && replace.type().equals(SynonymFilter.TYPE_SYNONYM)) {
                    if ((newEntry.getLast().startOffset() == replace.startOffset())
                            && newEntry.getLast().type().equals(SynonymFilter.TYPE_SYNONYM)) {
                        positionalSynonymSequencesSet.add(tokenSequence);
                        newEntry.add(replace);
                        tokenFoundSequence = true;
                    } else if (newEntry.getLast().type().equals(CommonGramsFilter.GRAM_TYPE)) {
                        if (newEntry.toString().endsWith(replace.toString())) {
                            // already in the gram
                            positionalSynonymSequencesSet.add(tokenSequence);
                            tokenFoundSequence = true;
                        } else {
                            // need to replace the synonym in the current
                            // gram
                            tokenFoundSequence = true;
                            StringBuffer old = new StringBuffer(newEntry.getLast().toString());
                            old.replace(replace.startOffset() - newEntry.getLast().startOffset(),
                                    replace.endOffset() - newEntry.getLast().startOffset(), replace.toString());
                            PackedTokenAttributeImpl newToken = new PackedTokenAttributeImpl();
                            newToken.setEmpty().append(old.toString());
                            newToken.setOffset(newEntry.getLast().startOffset(),
                                    newEntry.getLast().endOffset());
                            newEntry.removeLast();
                            newEntry.add(newToken);
                        }
                    }
                } else if ((newEntry.getLast().startOffset() < replace.startOffset())
                        && (newEntry.getLast().endOffset() < replace.endOffset())) {
                    if (newEntry.getLast().type().equals(SynonymFilter.TYPE_SYNONYM)
                            && replace.type().equals(SynonymFilter.TYPE_SYNONYM)) {
                        positionalSynonymSequencesSet.add(tokenSequence);
                    }
                    newEntry.add(replace);
                    tokenFoundSequence = true;
                }
                newAllTokenSequencesSet.add(newEntry);
            }
            if (false == tokenFoundSequence) {
                for (LinkedList<PackedTokenAttributeImpl> tokenSequence : newAllTokenSequencesSet) {
                    LinkedList<PackedTokenAttributeImpl> newEntry = new LinkedList<PackedTokenAttributeImpl>();
                    newEntry.addAll(tokenSequence);
                    if ((newEntry.getLast().endOffset() == replace.endOffset())
                            && replace.type().equals(SynonymFilter.TYPE_SYNONYM)) {
                        if ((newEntry.getLast().startOffset() == replace.startOffset())
                                && newEntry.getLast().type().equals(SynonymFilter.TYPE_SYNONYM)) {
                            positionalSynonymSequencesSet.add(tokenSequence);
                            newEntry.add(replace);
                            tokenFoundSequence = true;
                        } else if (newEntry.getLast().type().equals(CommonGramsFilter.GRAM_TYPE)) {
                            if (newEntry.toString().endsWith(replace.toString())) {
                                // already in the gram
                                positionalSynonymSequencesSet.add(tokenSequence);
                                tokenFoundSequence = true;
                            } else {
                                // need to replace the synonym in the
                                // current gram
                                tokenFoundSequence = true;
                                StringBuffer old = new StringBuffer(newEntry.getLast().toString());
                                old.replace(replace.startOffset() - newEntry.getLast().startOffset(),
                                        replace.endOffset() - newEntry.getLast().startOffset(),
                                        replace.toString());
                                PackedTokenAttributeImpl newToken = new PackedTokenAttributeImpl();
                                newToken.setEmpty().append(old.toString());
                                newToken.setOffset(newEntry.getLast().startOffset(),
                                        newEntry.getLast().endOffset());
                                newEntry.removeLast();
                                newEntry.add(newToken);
                                positionalSynonymSequencesSet.add(newEntry);
                            }
                        }
                    } else if ((newEntry.getLast().startOffset() < replace.startOffset())
                            && (newEntry.getLast().endOffset() < replace.endOffset())) {
                        if (newEntry.getLast().type().equals(SynonymFilter.TYPE_SYNONYM)
                                && replace.type().equals(SynonymFilter.TYPE_SYNONYM)) {
                            positionalSynonymSequencesSet.add(tokenSequence);
                            newEntry.add(replace);
                            tokenFoundSequence = true;
                        }
                    }
                }
            }
            if (false == tokenFoundSequence) {
                LinkedList<PackedTokenAttributeImpl> newEntry = new LinkedList<PackedTokenAttributeImpl>();
                newEntry.add(replace);
                newAllTokenSequencesSet.add(newEntry);
            }
            // Limit the max number of permutations we consider
            if (newAllTokenSequencesSet.size() > 64) {
                break FOR_FIRST_TOKEN_AT_POSITION_ONLY;
            }
        }
        allTokenSequencesSet = newAllTokenSequencesSet;
        allTokenSequencesSet.addAll(positionalSynonymSequencesSet);

    }

    LinkedList<LinkedList<PackedTokenAttributeImpl>> allTokenSequences = new LinkedList<LinkedList<PackedTokenAttributeImpl>>(
            allTokenSequencesSet);

    // Build the unique fixed token sequences

    LinkedList<LinkedList<PackedTokenAttributeImpl>> fixedTokenSequences = new LinkedList<LinkedList<PackedTokenAttributeImpl>>();
    for (LinkedList<PackedTokenAttributeImpl> tokenSequence : allTokenSequences) {
        LinkedList<PackedTokenAttributeImpl> fixedTokenSequence = new LinkedList<PackedTokenAttributeImpl>();
        fixedTokenSequences.add(fixedTokenSequence);
        PackedTokenAttributeImpl replace = null;
        for (PackedTokenAttributeImpl c : tokenSequence) {
            if (replace == null) {
                StringBuilder prefix = new StringBuilder();
                for (int i = c.startOffset() - 1; i >= 0; i--) {
                    char test = testText.charAt(i);
                    if (((test == '*') || (test == '?')) && wildcardPositions.contains(i)) {
                        prefix.insert(0, test);
                    } else {
                        break;
                    }
                }
                String pre = prefix.toString();
                if (requiresMLTokenDuplication) {
                    String termText = c.toString();
                    int position = termText.indexOf("}");
                    String language = termText.substring(0, position + 1);
                    String token = termText.substring(position + 1);
                    replace = new PackedTokenAttributeImpl();
                    replace.setEmpty().append(language + pre + token);
                    replace.setOffset(c.startOffset() - pre.length(), c.endOffset());
                    replace.setType(c.type());
                    replace.setPositionIncrement(c.getPositionIncrement());
                } else {
                    String termText = c.toString();
                    replace = new PackedTokenAttributeImpl();
                    replace.setEmpty().append(pre + termText);
                    replace.setOffset(c.startOffset() - pre.length(), c.endOffset());
                    replace.setType(c.type());
                    replace.setPositionIncrement(c.getPositionIncrement());
                }
            } else {
                StringBuilder prefix = new StringBuilder();
                StringBuilder postfix = new StringBuilder();
                StringBuilder builder = prefix;
                for (int i = c.startOffset() - 1; i >= replace.endOffset(); i--) {
                    char test = testText.charAt(i);
                    if (((test == '*') || (test == '?')) && wildcardPositions.contains(i)) {
                        builder.insert(0, test);
                    } else {
                        builder = postfix;
                        postfix.setLength(0);
                    }
                }
                String pre = prefix.toString();
                String post = postfix.toString();

                // Does it bridge?
                if ((pre.length() > 0) && (replace.endOffset() + pre.length()) == c.startOffset()) {
                    String termText = c.toString();
                    if (requiresMLTokenDuplication) {
                        int position = termText.indexOf("}");
                        @SuppressWarnings("unused")
                        String language = termText.substring(0, position + 1);
                        String token = termText.substring(position + 1);
                        int oldPositionIncrement = replace.getPositionIncrement();
                        String replaceTermText = replace.toString();
                        replace = new PackedTokenAttributeImpl();
                        replace.setEmpty().append(replaceTermText + pre + token);
                        replace.setOffset(replace.startOffset(), c.endOffset());
                        replace.setType(replace.type());
                        replace.setPositionIncrement(oldPositionIncrement);
                    } else {
                        int oldPositionIncrement = replace.getPositionIncrement();
                        String replaceTermText = replace.toString();
                        replace = new PackedTokenAttributeImpl();
                        replace.setEmpty().append(replaceTermText + pre + termText);
                        replace.setOffset(replace.startOffset(), c.endOffset());
                        replace.setType(replace.type());
                        replace.setPositionIncrement(oldPositionIncrement);
                    }
                } else {
                    String termText = c.toString();
                    if (requiresMLTokenDuplication) {
                        int position = termText.indexOf("}");
                        String language = termText.substring(0, position + 1);
                        String token = termText.substring(position + 1);
                        String replaceTermText = replace.toString();
                        PackedTokenAttributeImpl last = new PackedTokenAttributeImpl();
                        last.setEmpty().append(replaceTermText + post);
                        last.setOffset(replace.startOffset(), replace.endOffset() + post.length());
                        last.setType(replace.type());
                        last.setPositionIncrement(replace.getPositionIncrement());
                        fixedTokenSequence.add(last);
                        replace = new PackedTokenAttributeImpl();
                        replace.setEmpty().append(language + pre + token);
                        replace.setOffset(c.startOffset() - pre.length(), c.endOffset());
                        replace.setType(c.type());
                        replace.setPositionIncrement(c.getPositionIncrement());
                    } else {
                        String replaceTermText = replace.toString();
                        PackedTokenAttributeImpl last = new PackedTokenAttributeImpl();
                        last.setEmpty().append(replaceTermText + post);
                        last.setOffset(replace.startOffset(), replace.endOffset() + post.length());
                        last.setType(replace.type());
                        last.setPositionIncrement(replace.getPositionIncrement());
                        fixedTokenSequence.add(last);
                        replace = new PackedTokenAttributeImpl();
                        replace.setEmpty().append(pre + termText);
                        replace.setOffset(c.startOffset() - pre.length(), c.endOffset());
                        replace.setType(c.type());
                        replace.setPositionIncrement(c.getPositionIncrement());
                    }
                }
            }
        }
        // finish last
        if (replace != null) {
            StringBuilder postfix = new StringBuilder();
            if ((replace.endOffset() >= 0) && (replace.endOffset() < testText.length())) {
                for (int i = replace.endOffset(); i < testText.length(); i++) {
                    char test = testText.charAt(i);
                    if (((test == '*') || (test == '?')) && wildcardPositions.contains(i)) {
                        postfix.append(test);
                    } else {
                        break;
                    }
                }
            }
            String post = postfix.toString();
            int oldPositionIncrement = replace.getPositionIncrement();
            String replaceTermText = replace.toString();
            PackedTokenAttributeImpl terminal = new PackedTokenAttributeImpl();
            terminal.setEmpty().append(replaceTermText + post);
            terminal.setOffset(replace.startOffset(), replace.endOffset() + post.length());
            terminal.setType(replace.type());
            terminal.setPositionIncrement(oldPositionIncrement);
            fixedTokenSequence.add(terminal);
        }
    }

    // rebuild fixed list

    ArrayList<PackedTokenAttributeImpl> fixed = new ArrayList<PackedTokenAttributeImpl>();
    for (LinkedList<PackedTokenAttributeImpl> tokenSequence : fixedTokenSequences) {
        for (PackedTokenAttributeImpl token : tokenSequence) {
            fixed.add(token);
        }
    }

    // reorder by start position and increment

    Collections.sort(fixed, new Comparator<PackedTokenAttributeImpl>() {

        public int compare(PackedTokenAttributeImpl o1, PackedTokenAttributeImpl o2) {
            int dif = o1.startOffset() - o2.startOffset();
            if (dif != 0) {
                return dif;
            } else {
                return o1.getPositionIncrement() - o2.getPositionIncrement();
            }
        }
    });

    // make sure we remove any tokens we have duplicated

    @SuppressWarnings("rawtypes")
    OrderedHashSet unique = new OrderedHashSet();
    unique.addAll(fixed);
    fixed = new ArrayList<PackedTokenAttributeImpl>(unique);

    list = fixed;

    // add any missing locales back to the tokens

    if (localePrefix.length() > 0) {
        for (int j = 0; j < list.size(); j++) {
            PackedTokenAttributeImpl currentToken = list.get(j);
            String termText = currentToken.toString();
            currentToken.setEmpty();
            currentToken.append(localePrefix + termText);
        }
    }

    SchemaField sf = schema.getField(field);

    boolean isShingled = false;
    @SuppressWarnings("resource")
    TokenizerChain tokenizerChain = (sf.getType().getQueryAnalyzer() instanceof TokenizerChain)
            ? ((TokenizerChain) sf.getType().getQueryAnalyzer())
            : null;
    if (tokenizerChain != null) {
        for (TokenFilterFactory factory : tokenizerChain.getTokenFilterFactories()) {
            if (factory instanceof ShingleFilterFactory) {
                isShingled = true;
                break;
            }
        }
    }
    @SuppressWarnings("resource")
    AlfrescoAnalyzerWrapper analyzerWrapper = (sf.getType()
            .getQueryAnalyzer() instanceof AlfrescoAnalyzerWrapper)
                    ? ((AlfrescoAnalyzerWrapper) sf.getType().getQueryAnalyzer())
                    : null;
    if (analyzerWrapper != null) {
        // Assume that if there are no term positions it is shingled.
        isShingled = true;
    }

    boolean forceConjunction = rerankPhase == RerankPhase.QUERY_PHASE;

    if (list.size() == 0) {
        return null;
    } else if (list.size() == 1) {
        nextToken = list.get(0);
        String termText = nextToken.toString();
        if (!isNumeric && (termText.contains("*") || termText.contains("?"))) {
            return newWildcardQuery(new Term(field, termText));
        } else {
            return newTermQuery(new Term(field, termText));
        }
    } else {
        if (severalTokensAtSamePosition) {
            if (positionCount == 1) {
                // no phrase query:
                Builder q = newBooleanQuery();
                for (int i = 0; i < list.size(); i++) {
                    Query currentQuery;
                    nextToken = list.get(i);
                    String termText = nextToken.toString();
                    if (termText.contains("*") || termText.contains("?")) {
                        currentQuery = newWildcardQuery(new Term(field, termText));
                    } else {
                        currentQuery = newTermQuery(new Term(field, termText));
                    }
                    q.add(currentQuery, BooleanClause.Occur.SHOULD);
                }
                return q.build();
            } else if (forceConjunction) {
                BooleanQuery.Builder or = new BooleanQuery.Builder();

                for (LinkedList<PackedTokenAttributeImpl> tokenSequence : fixedTokenSequences) {
                    BooleanQuery.Builder and = new BooleanQuery.Builder();
                    for (int i = 0; i < tokenSequence.size(); i++) {
                        nextToken = (PackedTokenAttributeImpl) tokenSequence.get(i);
                        String termText = nextToken.toString();

                        Term term = new Term(field, termText);
                        if ((termText != null) && (termText.contains("*") || termText.contains("?"))) {
                            org.apache.lucene.search.WildcardQuery wildQuery = new org.apache.lucene.search.WildcardQuery(
                                    term);
                            and.add(wildQuery, Occur.MUST);
                        } else {
                            TermQuery termQuery = new TermQuery(term);
                            and.add(termQuery, Occur.MUST);
                        }
                    }
                    if (and.build().clauses().size() > 0) {
                        or.add(and.build(), Occur.SHOULD);
                    }
                }
                return or.build();
            }
            // shingle
            else if (sf.omitPositions() && isShingled) {

                ArrayList<PackedTokenAttributeImpl> nonContained = getNonContained(list);
                Query currentQuery;

                BooleanQuery.Builder weakPhrase = new BooleanQuery.Builder();
                for (PackedTokenAttributeImpl shingleToken : nonContained) {
                    String termText = shingleToken.toString();
                    Term term = new Term(field, termText);

                    if ((termText != null) && (termText.contains("*") || termText.contains("?"))) {
                        currentQuery = new org.apache.lucene.search.WildcardQuery(term);
                    } else {
                        currentQuery = new TermQuery(term);
                    }
                    weakPhrase.add(currentQuery, Occur.MUST);
                }

                return weakPhrase.build();

            }
            // Word delimiter factories and other odd things generate complex
            // token patterns. Smartly skip token sequences with small tokens
            // that generate too many wildcards, and fall back to the larger
            // pattern. E.g. Site1* will not do (S ite 1*) or (Site 1*) if 1*
            // matches too much; (S ite1*) and (Site1*) will still be OK. If
            // we skip all of them (for just 1* in the input) this is still
            // an issue.
            else {

                return generateSpanOrQuery(field, fixedTokenSequences);

            }
        } else {
            if (forceConjunction) {
                BooleanQuery.Builder or = new BooleanQuery.Builder();

                for (LinkedList<PackedTokenAttributeImpl> tokenSequence : fixedTokenSequences) {
                    BooleanQuery.Builder and = new BooleanQuery.Builder();
                    for (int i = 0; i < tokenSequence.size(); i++) {
                        nextToken = (PackedTokenAttributeImpl) tokenSequence.get(i);
                        String termText = nextToken.toString();

                        Term term = new Term(field, termText);
                        if ((termText != null) && (termText.contains("*") || termText.contains("?"))) {
                            org.apache.lucene.search.WildcardQuery wildQuery = new org.apache.lucene.search.WildcardQuery(
                                    term);
                            and.add(wildQuery, Occur.MUST);
                        } else {
                            TermQuery termQuery = new TermQuery(term);
                            and.add(termQuery, Occur.MUST);
                        }
                    }
                    if (and.build().clauses().size() > 0) {
                        or.add(and.build(), Occur.SHOULD);
                    }
                }
                return or.build();
            } else {
                SpanQuery spanQuery = null;
                ArrayList<SpanQuery> atSamePositionSpanOrQueryParts = new ArrayList<SpanQuery>();
                int gap = 0;
                for (int i = 0; i < list.size(); i++) {
                    nextToken = list.get(i);
                    String termText = nextToken.toString();
                    Term term = new Term(field, termText);
                    if (getEnablePositionIncrements()) {
                        SpanQuery nextSpanQuery;
                        if ((termText != null) && (termText.contains("*") || termText.contains("?"))) {
                            org.apache.lucene.search.WildcardQuery wildQuery = new org.apache.lucene.search.WildcardQuery(
                                    term);
                            SpanMultiTermQueryWrapper<org.apache.lucene.search.WildcardQuery> wrapper = new SpanMultiTermQueryWrapper<org.apache.lucene.search.WildcardQuery>(
                                    wildQuery);
                            wrapper.setRewriteMethod(
                                    new TopTermsSpanBooleanQueryRewrite(topTermSpanRewriteLimit));
                            nextSpanQuery = wrapper;
                        } else {
                            nextSpanQuery = new SpanTermQuery(term);
                        }
                        if (gap == 0) {
                            atSamePositionSpanOrQueryParts.add(nextSpanQuery);
                        } else {
                            if (atSamePositionSpanOrQueryParts.size() == 0) {
                                if (spanQuery == null) {
                                    spanQuery = nextSpanQuery;
                                } else {
                                    spanQuery = new SpanNearQuery(new SpanQuery[] { spanQuery, nextSpanQuery },
                                            (gap - 1) + internalSlop, internalSlop < 2);
                                }
                                atSamePositionSpanOrQueryParts = new ArrayList<SpanQuery>();
                            } else if (atSamePositionSpanOrQueryParts.size() == 1) {
                                if (spanQuery == null) {
                                    spanQuery = atSamePositionSpanOrQueryParts.get(0);
                                } else {
                                    spanQuery = new SpanNearQuery(
                                            new SpanQuery[] { spanQuery,
                                                    atSamePositionSpanOrQueryParts.get(0) },
                                            (gap - 1) + internalSlop, internalSlop < 2);
                                }
                                atSamePositionSpanOrQueryParts = new ArrayList<SpanQuery>();
                                atSamePositionSpanOrQueryParts.add(nextSpanQuery);
                            } else {
                                if (spanQuery == null) {
                                    spanQuery = new SpanOrQuery(
                                            atSamePositionSpanOrQueryParts.toArray(new SpanQuery[] {}));
                                } else {
                                    spanQuery = new SpanNearQuery(
                                            new SpanQuery[] { spanQuery,
                                                    new SpanOrQuery(atSamePositionSpanOrQueryParts
                                                            .toArray(new SpanQuery[] {})) },
                                            (gap - 1) + internalSlop, internalSlop < 2);
                                }
                                atSamePositionSpanOrQueryParts = new ArrayList<SpanQuery>();
                                atSamePositionSpanOrQueryParts.add(nextSpanQuery);
                            }
                        }
                        gap = nextToken.getPositionIncrement();
                    } else {
                        SpanQuery nextSpanQuery;
                        if ((termText != null) && (termText.contains("*") || termText.contains("?"))) {
                            org.apache.lucene.search.WildcardQuery wildQuery = new org.apache.lucene.search.WildcardQuery(
                                    term);
                            SpanMultiTermQueryWrapper<org.apache.lucene.search.WildcardQuery> wrapper = new SpanMultiTermQueryWrapper<org.apache.lucene.search.WildcardQuery>(
                                    wildQuery);
                            wrapper.setRewriteMethod(
                                    new TopTermsSpanBooleanQueryRewrite(topTermSpanRewriteLimit));
                            nextSpanQuery = wrapper;
                        } else {
                            nextSpanQuery = new SpanTermQuery(term);
                        }
                        if (spanQuery == null) {
                            spanQuery = new SpanOrQuery(nextSpanQuery);
                        } else {
                            spanQuery = new SpanOrQuery(spanQuery, nextSpanQuery);
                        }
                    }
                }
                if (atSamePositionSpanOrQueryParts.size() == 0) {
                    return spanQuery;
                } else if (atSamePositionSpanOrQueryParts.size() == 1) {
                    if (spanQuery == null) {
                        spanQuery = atSamePositionSpanOrQueryParts.get(0);
                    } else {
                        spanQuery = new SpanNearQuery(
                                new SpanQuery[] { spanQuery, atSamePositionSpanOrQueryParts.get(0) },
                                (gap - 1) + internalSlop, internalSlop < 2);
                    }
                    return spanQuery;
                } else {
                    if (spanQuery == null) {
                        spanQuery = new SpanOrQuery(atSamePositionSpanOrQueryParts.toArray(new SpanQuery[] {}));
                    } else {
                        spanQuery = new SpanNearQuery(
                                new SpanQuery[] { spanQuery,
                                        new SpanOrQuery(
                                                atSamePositionSpanOrQueryParts.toArray(new SpanQuery[] {})) },
                                (gap - 1) + internalSlop, internalSlop < 2);
                    }
                    return spanQuery;
                }
            }
        }
    }
}

From source file:org.alfresco.solr.query.Solr4QueryParser.java

License:Open Source License
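
Here incrementToken is used to pull only the first token the analyzer produces for a range bound; the stream is closed in the finally block, and null is returned if no token is produced.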

private String getFirstTokenForRange(String string, FieldInstance field) throws IOException {
    PackedTokenAttributeImpl nextToken;
    TokenStream source = null;

    try {
        source = getAnalyzer().tokenStream(field.getField(), new StringReader(string));
        source.reset();
        while (source.incrementToken()) {
            CharTermAttribute cta = source.getAttribute(CharTermAttribute.class);
            OffsetAttribute offsetAtt = source.getAttribute(OffsetAttribute.class);
            TypeAttribute typeAtt = null;
            if (source.hasAttribute(TypeAttribute.class)) {
                typeAtt = source.getAttribute(TypeAttribute.class);
            }
            PositionIncrementAttribute posIncAtt = null;
            if (source.hasAttribute(PositionIncrementAttribute.class)) {
                posIncAtt = source.getAttribute(PositionIncrementAttribute.class);
            }
            nextToken = new PackedTokenAttributeImpl();
            nextToken.setEmpty().copyBuffer(cta.buffer(), 0, cta.length());
            nextToken.setOffset(offsetAtt.startOffset(), offsetAtt.endOffset());
            if (typeAtt != null) {
                nextToken.setType(typeAtt.type());
            }
            if (posIncAtt != null) {
                nextToken.setPositionIncrement(posIncAtt.getPositionIncrement());
            }

            return nextToken.toString();
        }
    } finally {
        try {
            if (source != null) {
                source.close();
            }
        } catch (IOException e) {
            // ignore
        }
    }
    return null;
}

From source file:org.alfresco.solr.query.Solr4QueryParser.java

License:Open Source License
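
This method turns every token the analyzer produces into a SpanTermQuery clause and combines the clauses into a SpanOrQuery when there is more than one.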

/**
 * @param first
 * @param field
 * @return SpanOrQuery
 * @throws IOException
 */
private SpanQuery buildSpanOrQuery(String first, FieldInstance field) throws IOException {
    ArrayList<SpanQuery> spanOrQueryParts = new ArrayList<SpanQuery>();

    PackedTokenAttributeImpl nextToken;
    TokenStream source = null;

    try {
        source = getAnalyzer().tokenStream(field.getField(), new StringReader(first));
        source.reset();
        while (source.incrementToken()) {
            CharTermAttribute cta = source.getAttribute(CharTermAttribute.class);
            OffsetAttribute offsetAtt = source.getAttribute(OffsetAttribute.class);
            TypeAttribute typeAtt = null;
            if (source.hasAttribute(TypeAttribute.class)) {
                typeAtt = source.getAttribute(TypeAttribute.class);
            }
            PositionIncrementAttribute posIncAtt = null;
            if (source.hasAttribute(PositionIncrementAttribute.class)) {
                posIncAtt = source.getAttribute(PositionIncrementAttribute.class);
            }
            nextToken = new PackedTokenAttributeImpl();
            nextToken.setEmpty().copyBuffer(cta.buffer(), 0, cta.length());
            nextToken.setOffset(offsetAtt.startOffset(), offsetAtt.endOffset());
            if (typeAtt != null) {
                nextToken.setType(typeAtt.type());
            }
            if (posIncAtt != null) {
                nextToken.setPositionIncrement(posIncAtt.getPositionIncrement());
            }

            SpanQuery termQuery = new SpanTermQuery(new Term(field.getField(), nextToken.toString()));
            spanOrQueryParts.add(termQuery);
        }
    } finally {
        try {
            if (source != null) {
                source.close();
            }
        } catch (IOException e) {
            // ignore
        }
    }

    if (spanOrQueryParts.size() == 1) {
        return spanOrQueryParts.get(0);
    } else {
        return new SpanOrQuery(spanOrQueryParts.toArray(new SpanQuery[] {}));
    }
}

From source file:org.alfresco.solr.SolrInformationServer.java

License:Open Source License
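
This example feeds extracted text content through the min_hash field's index analyzer and adds each token, hex-encoded character by character, as a fingerprint field value on the Solr document.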

private void addContentPropertyToDocUsingAlfrescoRepository(SolrInputDocument doc, QName propertyQName,
        long dbId, String locale) throws AuthenticationException, IOException {
    long start = System.nanoTime();

    // Expensive call to be done with ContentTracker
    GetTextContentResponse response = repositoryClient.getTextContent(dbId, propertyQName, null);

    addContentPropertyMetadata(doc, propertyQName, AlfrescoSolrDataModel.ContentFieldType.TRANSFORMATION_STATUS,
            response);
    addContentPropertyMetadata(doc, propertyQName,
            AlfrescoSolrDataModel.ContentFieldType.TRANSFORMATION_EXCEPTION, response);
    addContentPropertyMetadata(doc, propertyQName, AlfrescoSolrDataModel.ContentFieldType.TRANSFORMATION_TIME,
            response);

    InputStream ris = response.getContent();
    String textContent = "";
    try {
        if (ris != null) {
            // Get and copy content
            byte[] bytes = FileCopyUtils.copyToByteArray(new BoundedInputStream(ris, contentStreamLimit));
            textContent = new String(bytes, StandardCharsets.UTF_8);
        }
    } finally {
        // release the response only when the content has been read
        response.release();
    }

    if (minHash && textContent.length() > 0) {
        Analyzer analyzer = core.getLatestSchema().getFieldType("min_hash").getIndexAnalyzer();
        TokenStream ts = analyzer.tokenStream("min_hash", textContent);
        CharTermAttribute termAttribute = ts.getAttribute(CharTermAttribute.class);
        ts.reset();
        while (ts.incrementToken()) {
            StringBuilder tokenBuff = new StringBuilder();
            char[] buff = termAttribute.buffer();

            for (int i = 0; i < termAttribute.length(); i++) {
                tokenBuff.append(Integer.toHexString(buff[i]));
            }
            doc.addField(FINGERPRINT_FIELD, tokenBuff.toString());

        }
        ts.end();
        ts.close();
    }

    long end = System.nanoTime();
    this.getTrackerStats().addDocTransformationTime(end - start);

    StringBuilder builder = new StringBuilder(textContent.length() + 16);
    builder.append("\u0000").append(locale).append("\u0000");
    builder.append(textContent);
    String localisedText = builder.toString();

    for (FieldInstance field : AlfrescoSolrDataModel.getInstance()
            .getIndexedFieldNamesForProperty(propertyQName).getFields()) {
        doc.removeField(field.getField());
        if (field.isLocalised()) {
            doc.addField(field.getField(), localisedText);
        } else {
            doc.addField(field.getField(), textContent);
        }
        addFieldIfNotSet(doc, field);
    }
}

From source file:org.allenai.blacklab.queryParser.lucene.QueryParserBase.java

License:Apache License
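
Here the loop degenerates into two calls: the first incrementToken must succeed and the second must fail, because a multi-term part (for example a range bound) must analyze to exactly one token.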

protected BytesRef analyzeMultitermTerm(String field, String part, Analyzer analyzerIn) {
    TokenStream source;

    if (analyzerIn == null)
        analyzerIn = analyzer;

    try {
        source = analyzerIn.tokenStream(field, new StringReader(part));
        source.reset();
    } catch (IOException e) {
        throw new RuntimeException("Unable to initialize TokenStream to analyze multiTerm term: " + part, e);
    }

    TermToBytesRefAttribute termAtt = source.getAttribute(TermToBytesRefAttribute.class);
    BytesRef bytes = termAtt.getBytesRef();

    try {
        if (!source.incrementToken())
            throw new IllegalArgumentException("analyzer returned no terms for multiTerm term: " + part);
        termAtt.fillBytesRef();
        if (source.incrementToken())
            throw new IllegalArgumentException("analyzer returned too many terms for multiTerm term: " + part);
    } catch (IOException e) {
        throw new RuntimeException("error analyzing range part: " + part, e);
    }

    try {
        source.end();
        source.close();
    } catch (IOException e) {
        throw new RuntimeException("Unable to end & close TokenStream after analyzing multiTerm term: " + part,
                e);
    }

    return BytesRef.deepCopyOf(bytes);
}

From source file:org.apache.jackrabbit.core.query.lucene.AbstractExcerpt.java

License:Apache License
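
This excerpt helper tokenises the text and records each term's character offsets in a sorted map, which backs the anonymous TermPositionVector implementation returned at the end.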

/**
 * @param text the text.
 * @return a <code>TermPositionVector</code> for the given text.
 */
private TermPositionVector createTermPositionVector(String text) {
    // term -> TermVectorOffsetInfo[]
    final SortedMap<String, TermVectorOffsetInfo[]> termMap = new TreeMap<String, TermVectorOffsetInfo[]>();
    Reader r = new StringReader(text);
    TokenStream ts = index.getTextAnalyzer().tokenStream("", r);
    try {
        while (ts.incrementToken()) {
            OffsetAttribute offset = ts.getAttribute(OffsetAttribute.class);
            TermAttribute term = ts.getAttribute(TermAttribute.class);
            String termText = term.term();
            TermVectorOffsetInfo[] info = termMap.get(termText);
            if (info == null) {
                info = new TermVectorOffsetInfo[1];
            } else {
                TermVectorOffsetInfo[] tmp = info;
                info = new TermVectorOffsetInfo[tmp.length + 1];
                System.arraycopy(tmp, 0, info, 0, tmp.length);
            }
            info[info.length - 1] = new TermVectorOffsetInfo(offset.startOffset(), offset.endOffset());
            termMap.put(termText, info);
        }
        ts.end();
        ts.close();
    } catch (IOException e) {
        // should never happen, we are reading from a string
    }

    return new TermPositionVector() {

        private String[] terms = (String[]) termMap.keySet().toArray(new String[termMap.size()]);

        public int[] getTermPositions(int index) {
            return null;
        }

        public TermVectorOffsetInfo[] getOffsets(int index) {
            TermVectorOffsetInfo[] info = TermVectorOffsetInfo.EMPTY_OFFSET_INFO;
            if (index >= 0 && index < terms.length) {
                info = termMap.get(terms[index]);
            }
            return info;
        }

        public String getField() {
            return "";
        }

        public int size() {
            return terms.length;
        }

        public String[] getTerms() {
            return terms;
        }

        public int[] getTermFrequencies() {
            int[] freqs = new int[terms.length];
            for (int i = 0; i < terms.length; i++) {
                freqs[i] = termMap.get(terms[i]).length;
            }
            return freqs;
        }

        public int indexOf(String term) {
            int res = Arrays.binarySearch(terms, term);
            return res >= 0 ? res : -1;
        }

        public int[] indexesOf(String[] terms, int start, int len) {
            int[] res = new int[len];
            for (int i = 0; i < len; i++) {
                res[i] = indexOf(terms[i]);
            }
            return res;
        }
    };
}

From source file:org.apache.jackrabbit.core.query.lucene.AbstractIndex.java

License:Apache License
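
Here a single incrementToken call suffices: a SingletonTokenStream holds exactly one token, whose term text and payload are captured so the field can be rebuilt without the original text extractor.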

/**
 * Returns a document that is finished with text extraction and is ready to
 * be added to the index.
 *
 * @param doc the document to check.
 * @return <code>doc</code> if it is finished already or a stripped down
 *         copy of <code>doc</code> without text extractors.
 * @throws IOException if the document cannot be added to the indexing
 *                     queue.
 */
private Document getFinishedDocument(Document doc) throws IOException {
    if (!Util.isDocumentReady(doc)) {
        Document copy = new Document();
        // mark the document that reindexing is required
        copy.add(new Field(FieldNames.REINDEXING_REQUIRED, false, "", Field.Store.NO,
                Field.Index.NOT_ANALYZED_NO_NORMS, Field.TermVector.NO));
        for (Fieldable f : doc.getFields()) {
            Fieldable field = null;
            Field.TermVector tv = getTermVectorParameter(f);
            Field.Store stored = f.isStored() ? Field.Store.YES : Field.Store.NO;
            Field.Index indexed = getIndexParameter(f);
            if (f instanceof LazyTextExtractorField || f.readerValue() != null) {
                // replace all readers with empty string reader
                field = new Field(f.name(), new StringReader(""), tv);
            } else if (f.stringValue() != null) {
                field = new Field(f.name(), false, f.stringValue(), stored, indexed, tv);
            } else if (f.isBinary()) {
                field = new Field(f.name(), f.getBinaryValue(), stored);
            } else if (f.tokenStreamValue() != null && f.tokenStreamValue() instanceof SingletonTokenStream) {
                TokenStream tokenStream = f.tokenStreamValue();
                TermAttribute termAttribute = tokenStream.addAttribute(TermAttribute.class);
                PayloadAttribute payloadAttribute = tokenStream.addAttribute(PayloadAttribute.class);
                tokenStream.incrementToken();
                String value = new String(termAttribute.termBuffer(), 0, termAttribute.termLength());
                // rewind the singleton stream so it can be consumed again when the
                // original document is indexed later
                tokenStream.reset();
                field = new Field(f.name(),
                        new SingletonTokenStream(value, (Payload) payloadAttribute.getPayload().clone()));
            }
            if (field != null) {
                field.setOmitNorms(f.getOmitNorms());
                copy.add(field);
            }
        }
        // schedule the original document for later indexing
        Document existing = indexingQueue.addDocument(doc);
        if (existing != null) {
            // the queue already contained a pending document for this
            // node. -> dispose the document
            Util.disposeDocument(existing);
        }
        // use the stripped down copy for now
        doc = copy;
    }
    return doc;
}
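The SingletonTokenStream handling above boils down to reading exactly one token out of a stream. A generic single-token read against the standard contract (which, unlike Jackrabbit's own singleton class, requires reset() first) might look like the sketch below; SingleToken and readSingleToken are hypothetical names:

import java.io.IOException;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

final class SingleToken {

    // Hypothetical helper: returns the first (and, for singleton streams, only)
    // token, or null if the stream is empty.
    static String readSingleToken(TokenStream ts) throws IOException {
        try (TokenStream stream = ts) {
            CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
            stream.reset();
            String value = stream.incrementToken() ? term.toString() : null;
            stream.end();
            return value;
        }
    }
}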

From source file:org.apache.jackrabbit.core.query.lucene.JackrabbitQueryParser.java

License:Apache License

/**
 * {@inheritDoc}
 */
protected Query getPrefixQuery(String field, String termStr) throws ParseException {
    // only create a prefix query when the term is a single word / token
    Analyzer a = getAnalyzer();
    TokenStream ts = a.tokenStream(field, new StringReader(termStr));
    int count = 0;
    boolean isCJ = false;
    try {
        TypeAttribute t = ts.addAttribute(TypeAttribute.class);
        ts.reset();
        while (ts.incrementToken()) {
            count++;
            isCJ = StandardTokenizer.TOKEN_TYPES[StandardTokenizer.CJ].equals(t.type());
        }
        ts.end();
    } catch (IOException e) {
        throw new ParseException(e.getMessage());
    } finally {
        try {
            ts.close();
        } catch (IOException e) {
            // ignore
        }
    }
    if (count > 1 && isCJ) {
        return getFieldQuery(field, termStr);
    } else {
        return getWildcardQuery(field, termStr + "*");
    }
}
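The fall-back makes sense because StandardTokenizer splits CJ (Chinese/Japanese) text into single-character tokens, so a trailing-* wildcard on the raw multi-character string would not line up with what was indexed. The token-counting half of the check can be isolated into a small helper; this is a sketch assuming Lucene 5+ (String-based tokenStream), with TokenCount and countTokens as hypothetical names:

import java.io.IOException;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;

final class TokenCount {

    // Hypothetical helper: how many tokens does the analyzer produce for this text?
    static int countTokens(Analyzer analyzer, String field, String text) throws IOException {
        int count = 0;
        try (TokenStream ts = analyzer.tokenStream(field, text)) {
            ts.reset();
            while (ts.incrementToken()) {
                count++;
            }
            ts.end();
        }
        return count;
    }
}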

From source file:org.apache.jackrabbit.core.query.lucene.MoreLikeThis.java

License:Apache License

/**
 * Adds term frequencies found by tokenizing text from reader into the Map words
 * @param r a source of text to be tokenized
 * @param termFreqMap a Map of terms and their frequencies
 * @param fieldName Used by analyzer for any special per-field analysis
 */
private void addTermFrequencies(Reader r, Map<String, Int> termFreqMap, String fieldName) throws IOException {
    TokenStream ts = analyzer.tokenStream(fieldName, r);
    ts.reset(); // the consumer workflow requires reset() before the first incrementToken()
    int tokenCount = 0;
    // for every token
    while (ts.incrementToken()) {
        TermAttribute term = ts.getAttribute(TermAttribute.class);
        String word = term.term();
        tokenCount++;
        if (tokenCount > maxNumTokensParsed) {
            break;
        }
        if (isNoiseWord(word)) {
            continue;
        }

        // increment frequency
        Int cnt = termFreqMap.get(word);
        if (cnt == null) {
            termFreqMap.put(word, new Int());
        } else {
            cnt.x++;
        }
    }
    ts.end();
    ts.close();
}

From source file:org.apache.jackrabbit.core.query.lucene.SearchIndex.java

License:Apache License

/**
 * Merges the fulltext indexed fields of the aggregated node states into
 * <code>doc</code>.
 *
 * @param state the node state on which <code>doc</code> was created.
 * @param doc the lucene document with index fields from <code>state</code>.
 * @param ifv the current index format version.
 */
protected void mergeAggregatedNodeIndexes(NodeState state, Document doc, IndexFormatVersion ifv) {
    if (indexingConfig != null) {
        AggregateRule[] aggregateRules = indexingConfig.getAggregateRules();
        if (aggregateRules == null) {
            return;
        }
        try {
            ItemStateManager ism = getContext().getItemStateManager();
            for (AggregateRule aggregateRule : aggregateRules) {
                boolean ruleMatched = false;
                // node includes
                NodeState[] aggregates = aggregateRule.getAggregatedNodeStates(state);
                if (aggregates != null) {
                    ruleMatched = true;
                    for (NodeState aggregate : aggregates) {
                        Document aDoc = createDocument(aggregate, getNamespaceMappings(), ifv);
                        // transfer fields to doc if there are any
                        Fieldable[] fulltextFields = aDoc.getFieldables(FieldNames.FULLTEXT);
                        if (fulltextFields != null) {
                            for (Fieldable fulltextField : fulltextFields) {
                                doc.add(fulltextField);
                            }
                            doc.add(new Field(FieldNames.AGGREGATED_NODE_UUID, false,
                                    aggregate.getNodeId().toString(), Field.Store.NO,
                                    Field.Index.NOT_ANALYZED_NO_NORMS, Field.TermVector.NO));
                        }
                    }
                    // make sure that fulltext fields are aligned properly
                    // first all stored fields, then remaining
                    Fieldable[] fulltextFields = doc.getFieldables(FieldNames.FULLTEXT);
                    doc.removeFields(FieldNames.FULLTEXT);
                    Arrays.sort(fulltextFields, FIELDS_COMPARATOR_STORED);
                    for (Fieldable f : fulltextFields) {
                        doc.add(f);
                    }
                }
                // property includes
                PropertyState[] propStates = aggregateRule.getAggregatedPropertyStates(state);
                if (propStates != null) {
                    ruleMatched = true;
                    for (PropertyState propState : propStates) {
                        String namePrefix = FieldNames.createNamedValue(
                                getNamespaceMappings().translateName(propState.getName()), "");
                        NodeState parent = (NodeState) ism.getItemState(propState.getParentId());
                        Document aDoc = createDocument(parent, getNamespaceMappings(), ifv);
                        try {
                            // find the right fields to transfer
                            Fieldable[] fields = aDoc.getFieldables(FieldNames.PROPERTIES);
                            for (Fieldable field : fields) {

                                // assume properties fields use SingletonTokenStream
                                TokenStream tokenStream = field.tokenStreamValue();
                                TermAttribute termAttribute = tokenStream.addAttribute(TermAttribute.class);
                                PayloadAttribute payloadAttribute = tokenStream
                                        .addAttribute(PayloadAttribute.class);
                                tokenStream.incrementToken();
                                tokenStream.end();
                                tokenStream.close();

                                String value = new String(termAttribute.termBuffer(), 0,
                                        termAttribute.termLength());
                                if (value.startsWith(namePrefix)) {
                                    // extract value
                                    String rawValue = value.substring(namePrefix.length());
                                    // create new named value
                                    Path p = getRelativePath(state, propState);
                                    String path = getNamespaceMappings().translatePath(p);
                                    value = FieldNames.createNamedValue(path, rawValue);
                                    termAttribute.setTermBuffer(value);
                                    PropertyMetaData pdm = PropertyMetaData
                                            .fromByteArray(payloadAttribute.getPayload().getData());
                                    doc.add(new Field(field.name(),
                                            new SingletonTokenStream(value, pdm.getPropertyType())));
                                    doc.add(new Field(FieldNames.AGGREGATED_NODE_UUID, false,
                                            parent.getNodeId().toString(), Field.Store.NO,
                                            Field.Index.NOT_ANALYZED_NO_NORMS, Field.TermVector.NO));
                                    if (pdm.getPropertyType() == PropertyType.STRING) {
                                        // add to fulltext index
                                        Field ft = new Field(FieldNames.FULLTEXT, false, rawValue,
                                                Field.Store.YES, Field.Index.ANALYZED_NO_NORMS,
                                                Field.TermVector.NO);
                                        doc.add(ft);
                                    }
                                }
                            }
                        } finally {
                            Util.disposeDocument(aDoc);
                        }
                    }
                }

                // only use first aggregate definition that matches
                if (ruleMatched) {
                    break;
                }
            }
        } catch (NoSuchItemStateException e) {
            // do not fail if aggregate cannot be created
            log.info("Exception while building indexing aggregate for {}. Node is not available {}.",
                    state.getNodeId(), e.getMessage());
        } catch (Exception e) {
            // do not fail if aggregate cannot be created
            log.warn("Exception while building indexing aggregate for " + state.getNodeId(), e);
        }
    }
}
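The core move of the aggregation code above is reading a single token together with its payload before rewriting it. Against the newer Lucene API (4+, where payloads are BytesRef rather than the Payload class used here), that read looks roughly like the sketch below; TermWithPayload and readTermAndPayload are hypothetical names:

import java.io.IOException;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
import org.apache.lucene.util.BytesRef;

final class TermWithPayload {

    // Hypothetical helper: read the first token and its payload, then release the stream.
    static void readTermAndPayload(TokenStream ts) throws IOException {
        CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
        PayloadAttribute payload = ts.addAttribute(PayloadAttribute.class);
        ts.reset();
        if (ts.incrementToken()) {
            String value = term.toString();
            BytesRef data = payload.getPayload(); // may be null if the token carries no payload
            System.out.println(value + " -> " + (data == null ? "(no payload)" : data.utf8ToString()));
        }
        ts.end();
        ts.close();
    }
}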