List of usage examples for org.apache.lucene.analysis TokenStream end

public void end() throws IOException

This method is called by the consumer after the last token has been consumed, i.e. after incrementToken() returned false (using the new TokenStream API).
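The examples below all follow the same consume sequence: reset(), an incrementToken() loop, end(), close(). As a quick orientation, here is a minimal self-contained sketch of that sequence against a recent Lucene version; the StandardAnalyzer and the field name "body" are illustrative assumptions, not taken from any of the projects listed below.

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;

public class TokenStreamEndSketch {
    public static void main(String[] args) throws IOException {
        // StandardAnalyzer and the field name "body" are illustrative choices.
        Analyzer analyzer = new StandardAnalyzer();
        List<String> tokens = new ArrayList<>();
        try (TokenStream ts = analyzer.tokenStream("body", "Hello TokenStream end example")) {
            CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
            OffsetAttribute offset = ts.addAttribute(OffsetAttribute.class);
            ts.reset();                          // mandatory before the first incrementToken()
            while (ts.incrementToken()) {
                tokens.add(term.toString());
            }
            ts.end();                            // records final state, e.g. the end offset of the text
            int endOffset = offset.endOffset();
            System.out.println(tokens + " endOffset=" + endOffset);
        }                                        // try-with-resources calls ts.close()
        analyzer.close();
    }
}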
From source file:org.alfresco.solr.query.Solr4QueryParser.java
License:Open Source License
private ArrayList<String> getTokens(IndexableField indexableField) throws IOException {
    ArrayList<String> tokens = new ArrayList<String>();
    TokenStream ts = indexableField.tokenStream(schema.getIndexAnalyzer(), null);
    CharTermAttribute termAttribute = ts.getAttribute(CharTermAttribute.class);
    ts.reset();
    while (ts.incrementToken()) {
        String token = new String(termAttribute.buffer(), 0, termAttribute.length());
        tokens.add(token);
    }
    ts.end();
    ts.close();
    return tokens;
}
From source file:org.alfresco.solr.SolrInformationServer.java
License:Open Source License
private void addContentPropertyToDocUsingAlfrescoRepository(SolrInputDocument doc, QName propertyQName,
        long dbId, String locale) throws AuthenticationException, IOException {
    long start = System.nanoTime();

    // Expensive call to be done with ContentTracker
    GetTextContentResponse response = repositoryClient.getTextContent(dbId, propertyQName, null);

    addContentPropertyMetadata(doc, propertyQName,
            AlfrescoSolrDataModel.ContentFieldType.TRANSFORMATION_STATUS, response);
    addContentPropertyMetadata(doc, propertyQName,
            AlfrescoSolrDataModel.ContentFieldType.TRANSFORMATION_EXCEPTION, response);
    addContentPropertyMetadata(doc, propertyQName,
            AlfrescoSolrDataModel.ContentFieldType.TRANSFORMATION_TIME, response);

    InputStream ris = response.getContent();
    String textContent = "";
    try {
        if (ris != null) {
            // Get and copy content
            byte[] bytes = FileCopyUtils.copyToByteArray(new BoundedInputStream(ris, contentStreamLimit));
            textContent = new String(bytes, StandardCharsets.UTF_8);
        }
    } finally {
        // release the response only when the content has been read
        response.release();
    }

    if (minHash && textContent.length() > 0) {
        Analyzer analyzer = core.getLatestSchema().getFieldType("min_hash").getIndexAnalyzer();
        TokenStream ts = analyzer.tokenStream("min_hash", textContent);
        CharTermAttribute termAttribute = ts.getAttribute(CharTermAttribute.class);
        ts.reset();
        while (ts.incrementToken()) {
            StringBuilder tokenBuff = new StringBuilder();
            char[] buff = termAttribute.buffer();
            for (int i = 0; i < termAttribute.length(); i++) {
                tokenBuff.append(Integer.toHexString(buff[i]));
            }
            doc.addField(FINGERPRINT_FIELD, tokenBuff.toString());
        }
        ts.end();
        ts.close();
    }

    long end = System.nanoTime();
    this.getTrackerStats().addDocTransformationTime(end - start);

    StringBuilder builder = new StringBuilder(textContent.length() + 16);
    builder.append("\u0000").append(locale).append("\u0000");
    builder.append(textContent);
    String localisedText = builder.toString();

    for (FieldInstance field : AlfrescoSolrDataModel.getInstance()
            .getIndexedFieldNamesForProperty(propertyQName).getFields()) {
        doc.removeField(field.getField());
        if (field.isLocalised()) {
            doc.addField(field.getField(), localisedText);
        } else {
            doc.addField(field.getField(), textContent);
        }
        addFieldIfNotSet(doc, field);
    }
}
From source file:org.allenai.blacklab.queryParser.lucene.QueryParserBase.java
License:Apache License
protected BytesRef analyzeMultitermTerm(String field, String part, Analyzer analyzerIn) {
    TokenStream source;
    if (analyzerIn == null)
        analyzerIn = analyzer;

    try {
        source = analyzerIn.tokenStream(field, new StringReader(part));
        source.reset();
    } catch (IOException e) {
        throw new RuntimeException("Unable to initialize TokenStream to analyze multiTerm term: " + part, e);
    }

    TermToBytesRefAttribute termAtt = source.getAttribute(TermToBytesRefAttribute.class);
    BytesRef bytes = termAtt.getBytesRef();

    try {
        if (!source.incrementToken())
            throw new IllegalArgumentException("analyzer returned no terms for multiTerm term: " + part);
        termAtt.fillBytesRef();
        if (source.incrementToken())
            throw new IllegalArgumentException("analyzer returned too many terms for multiTerm term: " + part);
    } catch (IOException e) {
        throw new RuntimeException("error analyzing range part: " + part, e);
    }

    try {
        source.end();
        source.close();
    } catch (IOException e) {
        throw new RuntimeException("Unable to end & close TokenStream after analyzing multiTerm term: " + part, e);
    }

    return BytesRef.deepCopyOf(bytes);
}
From source file:org.apache.jackrabbit.core.query.lucene.AbstractExcerpt.java
License:Apache License
/**
 * @param text the text.
 * @return a <code>TermPositionVector</code> for the given text.
 */
private TermPositionVector createTermPositionVector(String text) {
    // term -> TermVectorOffsetInfo[]
    final SortedMap<String, TermVectorOffsetInfo[]> termMap = new TreeMap<String, TermVectorOffsetInfo[]>();
    Reader r = new StringReader(text);
    TokenStream ts = index.getTextAnalyzer().tokenStream("", r);
    try {
        while (ts.incrementToken()) {
            OffsetAttribute offset = ts.getAttribute(OffsetAttribute.class);
            TermAttribute term = ts.getAttribute(TermAttribute.class);
            String termText = term.term();
            TermVectorOffsetInfo[] info = termMap.get(termText);
            if (info == null) {
                info = new TermVectorOffsetInfo[1];
            } else {
                TermVectorOffsetInfo[] tmp = info;
                info = new TermVectorOffsetInfo[tmp.length + 1];
                System.arraycopy(tmp, 0, info, 0, tmp.length);
            }
            info[info.length - 1] = new TermVectorOffsetInfo(offset.startOffset(), offset.endOffset());
            termMap.put(termText, info);
        }
        ts.end();
        ts.close();
    } catch (IOException e) {
        // should never happen, we are reading from a string
    }
    return new TermPositionVector() {

        private String[] terms = (String[]) termMap.keySet().toArray(new String[termMap.size()]);

        public int[] getTermPositions(int index) {
            return null;
        }

        public TermVectorOffsetInfo[] getOffsets(int index) {
            TermVectorOffsetInfo[] info = TermVectorOffsetInfo.EMPTY_OFFSET_INFO;
            if (index >= 0 && index < terms.length) {
                info = termMap.get(terms[index]);
            }
            return info;
        }

        public String getField() {
            return "";
        }

        public int size() {
            return terms.length;
        }

        public String[] getTerms() {
            return terms;
        }

        public int[] getTermFrequencies() {
            int[] freqs = new int[terms.length];
            for (int i = 0; i < terms.length; i++) {
                freqs[i] = termMap.get(terms[i]).length;
            }
            return freqs;
        }

        public int indexOf(String term) {
            int res = Arrays.binarySearch(terms, term);
            return res >= 0 ? res : -1;
        }

        public int[] indexesOf(String[] terms, int start, int len) {
            int[] res = new int[len];
            for (int i = 0; i < len; i++) {
                res[i] = indexOf(terms[i]);
            }
            return res;
        }
    };
}
From source file:org.apache.jackrabbit.core.query.lucene.JackrabbitQueryParser.java
License:Apache License
/**
 * {@inheritDoc}
 */
protected Query getPrefixQuery(String field, String termStr) throws ParseException {
    // only create a prefix query when the term is a single word / token
    Analyzer a = getAnalyzer();
    TokenStream ts = a.tokenStream(field, new StringReader(termStr));
    int count = 0;
    boolean isCJ = false;
    try {
        TypeAttribute t = ts.addAttribute(TypeAttribute.class);
        ts.reset();
        while (ts.incrementToken()) {
            count++;
            isCJ = StandardTokenizer.TOKEN_TYPES[StandardTokenizer.CJ].equals(t.type());
        }
        ts.end();
    } catch (IOException e) {
        throw new ParseException(e.getMessage());
    } finally {
        try {
            ts.close();
        } catch (IOException e) {
            // ignore
        }
    }
    if (count > 1 && isCJ) {
        return getFieldQuery(field, termStr);
    } else {
        return getWildcardQuery(field, termStr + "*");
    }
}
From source file:org.apache.jackrabbit.core.query.lucene.MoreLikeThis.java
License:Apache License
/**
 * Adds term frequencies found by tokenizing text from reader into the Map words
 *
 * @param r a source of text to be tokenized
 * @param termFreqMap a Map of terms and their frequencies
 * @param fieldName Used by analyzer for any special per-field analysis
 */
private void addTermFrequencies(Reader r, Map<String, Int> termFreqMap, String fieldName) throws IOException {
    TokenStream ts = analyzer.tokenStream(fieldName, r);
    int tokenCount = 0;
    // for every token
    while (ts.incrementToken()) {
        TermAttribute term = ts.getAttribute(TermAttribute.class);
        String word = term.term();
        tokenCount++;
        if (tokenCount > maxNumTokensParsed) {
            break;
        }
        if (isNoiseWord(word)) {
            continue;
        }
        // increment frequency
        Int cnt = termFreqMap.get(word);
        if (cnt == null) {
            termFreqMap.put(word, new Int());
        } else {
            cnt.x++;
        }
    }
    ts.end();
    ts.close();
}
From source file:org.apache.jackrabbit.core.query.lucene.SearchIndex.java
License:Apache License
/**
 * Merges the fulltext indexed fields of the aggregated node states into
 * <code>doc</code>.
 *
 * @param state the node state on which <code>doc</code> was created.
 * @param doc the lucene document with index fields from <code>state</code>.
 * @param ifv the current index format version.
 */
protected void mergeAggregatedNodeIndexes(NodeState state, Document doc, IndexFormatVersion ifv) {
    if (indexingConfig != null) {
        AggregateRule[] aggregateRules = indexingConfig.getAggregateRules();
        if (aggregateRules == null) {
            return;
        }
        try {
            ItemStateManager ism = getContext().getItemStateManager();
            for (AggregateRule aggregateRule : aggregateRules) {
                boolean ruleMatched = false;
                // node includes
                NodeState[] aggregates = aggregateRule.getAggregatedNodeStates(state);
                if (aggregates != null) {
                    ruleMatched = true;
                    for (NodeState aggregate : aggregates) {
                        Document aDoc = createDocument(aggregate, getNamespaceMappings(), ifv);
                        // transfer fields to doc if there are any
                        Fieldable[] fulltextFields = aDoc.getFieldables(FieldNames.FULLTEXT);
                        if (fulltextFields != null) {
                            for (Fieldable fulltextField : fulltextFields) {
                                doc.add(fulltextField);
                            }
                            doc.add(new Field(FieldNames.AGGREGATED_NODE_UUID, false,
                                    aggregate.getNodeId().toString(), Field.Store.NO,
                                    Field.Index.NOT_ANALYZED_NO_NORMS, Field.TermVector.NO));
                        }
                    }
                    // make sure that fulltext fields are aligned properly
                    // first all stored fields, then remaining
                    Fieldable[] fulltextFields = doc.getFieldables(FieldNames.FULLTEXT);
                    doc.removeFields(FieldNames.FULLTEXT);
                    Arrays.sort(fulltextFields, FIELDS_COMPARATOR_STORED);
                    for (Fieldable f : fulltextFields) {
                        doc.add(f);
                    }
                }
                // property includes
                PropertyState[] propStates = aggregateRule.getAggregatedPropertyStates(state);
                if (propStates != null) {
                    ruleMatched = true;
                    for (PropertyState propState : propStates) {
                        String namePrefix = FieldNames.createNamedValue(
                                getNamespaceMappings().translateName(propState.getName()), "");
                        NodeState parent = (NodeState) ism.getItemState(propState.getParentId());
                        Document aDoc = createDocument(parent, getNamespaceMappings(), ifv);
                        try {
                            // find the right fields to transfer
                            Fieldable[] fields = aDoc.getFieldables(FieldNames.PROPERTIES);
                            for (Fieldable field : fields) {
                                // assume properties fields use SingleTokenStream
                                TokenStream tokenStream = field.tokenStreamValue();
                                TermAttribute termAttribute = tokenStream.addAttribute(TermAttribute.class);
                                PayloadAttribute payloadAttribute = tokenStream
                                        .addAttribute(PayloadAttribute.class);
                                tokenStream.incrementToken();
                                tokenStream.end();
                                tokenStream.close();
                                String value = new String(termAttribute.termBuffer(), 0,
                                        termAttribute.termLength());
                                if (value.startsWith(namePrefix)) {
                                    // extract value
                                    String rawValue = value.substring(namePrefix.length());
                                    // create new named value
                                    Path p = getRelativePath(state, propState);
                                    String path = getNamespaceMappings().translatePath(p);
                                    value = FieldNames.createNamedValue(path, rawValue);
                                    termAttribute.setTermBuffer(value);
                                    PropertyMetaData pdm = PropertyMetaData
                                            .fromByteArray(payloadAttribute.getPayload().getData());
                                    doc.add(new Field(field.name(),
                                            new SingletonTokenStream(value, pdm.getPropertyType())));
                                    doc.add(new Field(FieldNames.AGGREGATED_NODE_UUID, false,
                                            parent.getNodeId().toString(), Field.Store.NO,
                                            Field.Index.NOT_ANALYZED_NO_NORMS, Field.TermVector.NO));
                                    if (pdm.getPropertyType() == PropertyType.STRING) {
                                        // add to fulltext index
                                        Field ft = new Field(FieldNames.FULLTEXT, false, rawValue,
                                                Field.Store.YES, Field.Index.ANALYZED_NO_NORMS,
                                                Field.TermVector.NO);
                                        doc.add(ft);
                                    }
                                }
                            }
                        } finally {
                            Util.disposeDocument(aDoc);
                        }
                    }
                }
                // only use first aggregate definition that matches
                if (ruleMatched) {
                    break;
                }
            }
        } catch (NoSuchItemStateException e) {
            // do not fail if aggregate cannot be created
            log.info("Exception while building indexing aggregate for {}. Node is not available {}.",
                    state.getNodeId(), e.getMessage());
        } catch (Exception e) {
            // do not fail if aggregate cannot be created
            log.warn("Exception while building indexing aggregate for " + state.getNodeId(), e);
        }
    }
}
From source file:org.apache.jackrabbit.oak.plugins.index.lucene.LuceneIndex.java
License:Apache License
/**
 * Tries to merge back tokens that are split on relevant fulltext query
 * wildcards ('*' or '?')
 *
 * @param text
 * @param analyzer
 * @return
 */
static List<String> tokenize(String text, Analyzer analyzer) {
    List<String> tokens = new ArrayList<String>();
    TokenStream stream = null;
    try {
        stream = analyzer.tokenStream(FieldNames.FULLTEXT, new StringReader(text));
        CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
        OffsetAttribute offsetAtt = stream.addAttribute(OffsetAttribute.class);
        // TypeAttribute type = stream.addAttribute(TypeAttribute.class);
        stream.reset();
        int poz = 0;
        boolean hasFulltextToken = false;
        StringBuilder token = new StringBuilder();
        while (stream.incrementToken()) {
            String term = termAtt.toString();
            int start = offsetAtt.startOffset();
            int end = offsetAtt.endOffset();
            if (start > poz) {
                for (int i = poz; i < start; i++) {
                    for (char c : fulltextTokens) {
                        if (c == text.charAt(i)) {
                            token.append(c);
                            hasFulltextToken = true;
                        }
                    }
                }
            }
            poz = end;
            if (hasFulltextToken) {
                token.append(term);
                hasFulltextToken = false;
            } else {
                if (token.length() > 0) {
                    tokens.add(token.toString());
                }
                token = new StringBuilder();
                token.append(term);
            }
        }
        // consume to the end of the string
        if (poz < text.length()) {
            for (int i = poz; i < text.length(); i++) {
                for (char c : fulltextTokens) {
                    if (c == text.charAt(i)) {
                        token.append(c);
                    }
                }
            }
        }
        if (token.length() > 0) {
            tokens.add(token.toString());
        }
        stream.end();
    } catch (IOException e) {
        LOG.error("Building fulltext query failed", e.getMessage());
        return null;
    } finally {
        try {
            if (stream != null) {
                stream.close();
            }
        } catch (IOException e) {
            // ignore
        }
    }
    return tokens;
}
From source file:org.apache.mahout.classifier.NewsgroupHelper.java
License:Apache License
public static void countWords(Analyzer analyzer, Collection<String> words, Reader in,
        Multiset<String> overallCounts) throws IOException {
    TokenStream ts = analyzer.tokenStream("text", in);
    ts.addAttribute(CharTermAttribute.class);
    ts.reset();
    while (ts.incrementToken()) {
        String s = ts.getAttribute(CharTermAttribute.class).toString();
        words.add(s);
    }
    overallCounts.addAll(words);
    ts.end();
    Closeables.close(ts, true);
}
From source file:org.apache.mahout.text.MailArchivesClusteringAnalyzerTest.java
License:Apache License
@Test
public void testAnalysis() throws Exception {
    Analyzer analyzer = new MailArchivesClusteringAnalyzer();
    String text = "A test message\n" + "atokenthatistoolongtobeusefulforclustertextanalysis\n"
            + "Mahout is a scalable, machine-learning LIBRARY\n"
            + "we've added some additional stopwords such as html, mailto, regards\t"
            + "apache_hadoop provides the foundation for scalability\n"
            + "www.nabble.com general-help@incubator.apache.org\n"
            + "public void int protected package";
    Reader reader = new StringReader(text);

    // if you change the text above, then you may need to change this as well
    // order matters too
    String[] expectedTokens = { "test", "mahout", "scalabl", "machin", "learn", "librari", "weve", "ad",
            "stopword", "apache_hadoop", "provid", "foundat", "scalabl" };

    TokenStream tokenStream = analyzer.tokenStream("test", reader);
    assertNotNull(tokenStream);
    tokenStream.reset();
    CharTermAttribute termAtt = tokenStream.addAttribute(CharTermAttribute.class);
    int e = 0;
    while (tokenStream.incrementToken() && e < expectedTokens.length) {
        assertEquals(expectedTokens[e++], termAtt.toString());
    }
    assertEquals(e, expectedTokens.length);
    tokenStream.end();
    tokenStream.close();
}