Example usage for org.apache.lucene.analysis TokenStream close

List of usage examples for org.apache.lucene.analysis TokenStream close

Introduction

On this page you can find example usage for org.apache.lucene.analysis TokenStream close.

Prototype

@Override
public void close() throws IOException 

Source Link

Document

Releases resources associated with this stream.
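
close() is the last step of the usual TokenStream consumption pattern: reset(), incrementToken() in a loop, end(), then close(). Because TokenStream implements Closeable, the call can also be handled by a try-with-resources block instead of an explicit finally, as several of the examples below do by hand. The following is only a minimal sketch of that pattern; it assumes a recent Lucene version, a StandardAnalyzer, and a placeholder field name "body", none of which come from the examples on this page.

import java.io.IOException;
import java.io.StringReader;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class TokenStreamCloseSketch {
    public static void main(String[] args) throws IOException {
        Analyzer analyzer = new StandardAnalyzer();
        // try-with-resources calls ts.close() automatically, releasing the stream's resources
        try (TokenStream ts = analyzer.tokenStream("body", new StringReader("Hello token stream world"))) {
            CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
            ts.reset();                  // must be called before the first incrementToken()
            while (ts.incrementToken()) {
                System.out.println(term.toString());
            }
            ts.end();                    // end-of-stream operations, e.g. set the final offset
        }
        analyzer.close();
    }
}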

Usage

From source file:ucas.IKAnalzyerDemo.java

License:Apache License

public static String Spilt2Words(String content) {
    String resString = "";
    //Create an IKAnalyzer with smart segmentation enabled (constructor argument true = smart mode)
    Analyzer analyzer = new IKAnalyzer(true);

    //Obtain a Lucene TokenStream from the analyzer
    TokenStream ts = null;
    try {
        //"myfield" is just a placeholder field name
        ts = analyzer.tokenStream("myfield", new StringReader(content));
        //Get the term text attribute
        CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);

        //Reset the TokenStream (resets the underlying StringReader)
        ts.reset();
        //Iterate over the tokens
        while (ts.incrementToken()) {
            resString += term.toString() + "|";
        }
        //End the TokenStream (done reading from the StringReader)
        ts.end(); // Perform end-of-stream operations, e.g. set the final offset.

    } catch (IOException e) {
        e.printStackTrace();
    } finally {
        //Release all resources held by the TokenStream
        if (ts != null) {
            try {
                ts.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }
    return resString;
}

From source file:uk.co.flax.luwak.analysis.TestSuffixingNGramTokenizer.java

License:Apache License

public static void main(String... args) throws IOException {

    String text = Files.toString(new File("src/test/resources/gutenberg/README"), Charsets.UTF_8);
    DocumentBatch batch = DocumentBatch
            .of(InputDocument.builder("1").addField("f", text, new StandardAnalyzer()).build());

    for (int i = 0; i < 50; i++) {

        long time = System.currentTimeMillis();

        // Cannot use try-with-resources here as we assign to ts in the block.
        LeafReader reader = batch.getIndexReader();
        TokenStream ts = new TermsEnumTokenStream(reader.fields().terms("f").iterator());
        try {
            ts = new SuffixingNGramTokenFilter(ts, "XX", "__WILDCARD__", 20);
            //ts = new DuplicateRemovalTokenFilter(ts);
            int tokencount = 0;
            ts.reset();
            while (ts.incrementToken()) {
                tokencount++;
            }

            System.out.println(tokencount + " tokens in " + (System.currentTimeMillis() - time) + " ms");
        } finally {
            ts.close();
        }
    }

}

From source file:uk.co.nickthecoder.pinkwino.metadata.LuceneMetaData.java

License:Open Source License

public String analyzeWord(String word) {
    Reader reader = new StringReader(word);
    TokenStream tokenStream = null;
    try {
        tokenStream = _analyzer.tokenStream("content", reader);
        CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class);

        tokenStream.reset();
        while (tokenStream.incrementToken()) {
            return charTermAttribute.toString();
        }
    } catch (Exception e) {
        _logger.error("Failed to filter a keyword. " + e);
    } finally {
        try {
            if (tokenStream != null) {
                tokenStream.end();
                tokenStream.close();
            }
            reader.close();
        } catch (Exception e) {
            // Do nothing
            _logger.error("Failed to close during analyzeWord " + e);
        }
    }
    return null;
}

From source file:uk.gov.nationalarchives.discovery.taxonomy.common.repository.lucene.analyzer.TaxonomyGeneralAnalyzerTest.java

License:Mozilla Public License

public static void assertTokenStreamContents(TokenStream ts, String[] output, int startOffsets[],
        int endOffsets[], String types[], int posIncrements[], int posLengths[], Integer finalOffset,
        Integer finalPosInc, boolean[] keywordAtts, boolean offsetsAreCorrect) throws IOException {
    assertNotNull(output);
    CheckClearAttributesAttribute checkClearAtt = ts.addAttribute(CheckClearAttributesAttribute.class);

    CharTermAttribute termAtt = null;
    if (output.length > 0) {
        assertTrue("has no CharTermAttribute", ts.hasAttribute(CharTermAttribute.class));
        termAtt = ts.getAttribute(CharTermAttribute.class);
    }

    OffsetAttribute offsetAtt = null;
    if (startOffsets != null || endOffsets != null || finalOffset != null) {
        assertTrue("has no OffsetAttribute", ts.hasAttribute(OffsetAttribute.class));
        offsetAtt = ts.getAttribute(OffsetAttribute.class);
    }

    TypeAttribute typeAtt = null;
    if (types != null) {
        assertTrue("has no TypeAttribute", ts.hasAttribute(TypeAttribute.class));
        typeAtt = ts.getAttribute(TypeAttribute.class);
    }

    PositionIncrementAttribute posIncrAtt = null;
    if (posIncrements != null || finalPosInc != null) {
        assertTrue("has no PositionIncrementAttribute", ts.hasAttribute(PositionIncrementAttribute.class));
        posIncrAtt = ts.getAttribute(PositionIncrementAttribute.class);
    }

    PositionLengthAttribute posLengthAtt = null;
    if (posLengths != null) {
        assertTrue("has no PositionLengthAttribute", ts.hasAttribute(PositionLengthAttribute.class));
        posLengthAtt = ts.getAttribute(PositionLengthAttribute.class);
    }

    KeywordAttribute keywordAtt = null;
    if (keywordAtts != null) {
        assertTrue("has no KeywordAttribute", ts.hasAttribute(KeywordAttribute.class));
        keywordAtt = ts.getAttribute(KeywordAttribute.class);
    }

    // Maps position to the start/end offset:
    final Map<Integer, Integer> posToStartOffset = new HashMap<>();
    final Map<Integer, Integer> posToEndOffset = new HashMap<>();

    ts.reset();
    int pos = -1;
    int lastStartOffset = 0;
    for (int i = 0; i < output.length; i++) {
        // extra safety to enforce, that the state is not preserved and also
        // assign bogus values
        ts.clearAttributes();
        termAtt.setEmpty().append("bogusTerm");
        if (offsetAtt != null)
            offsetAtt.setOffset(14584724, 24683243);
        if (typeAtt != null)
            typeAtt.setType("bogusType");
        if (posIncrAtt != null)
            posIncrAtt.setPositionIncrement(45987657);
        if (posLengthAtt != null)
            posLengthAtt.setPositionLength(45987653);
        if (keywordAtt != null)
            keywordAtt.setKeyword((i & 1) == 0);

        checkClearAtt.getAndResetClearCalled(); // reset it, because we called clearAttributes() before
        assertTrue("token " + i + " does not exist", ts.incrementToken());
        assertTrue("clearAttributes() was not called correctly in TokenStream chain",
                checkClearAtt.getAndResetClearCalled());

        assertEquals("term " + i, output[i], termAtt.toString());
        if (startOffsets != null) {
            assertEquals("startOffset " + i, startOffsets[i], offsetAtt.startOffset());
        }
        if (endOffsets != null) {
            assertEquals("endOffset " + i, endOffsets[i], offsetAtt.endOffset());
        }
        if (types != null) {
            assertEquals("type " + i, types[i], typeAtt.type());
        }
        if (posIncrements != null) {
            assertEquals("posIncrement " + i, posIncrements[i], posIncrAtt.getPositionIncrement());
        }
        if (posLengths != null) {
            assertEquals("posLength " + i, posLengths[i], posLengthAtt.getPositionLength());
        }
        if (keywordAtts != null) {
            assertEquals("keywordAtt " + i, keywordAtts[i], keywordAtt.isKeyword());
        }

        // we can enforce some basic things about a few attributes even if
        // the caller doesn't check:
        if (offsetAtt != null) {
            final int startOffset = offsetAtt.startOffset();
            final int endOffset = offsetAtt.endOffset();
            if (finalOffset != null) {
                assertTrue("startOffset must be <= finalOffset", startOffset <= finalOffset.intValue());
                assertTrue("endOffset must be <= finalOffset: got endOffset=" + endOffset + " vs finalOffset="
                        + finalOffset.intValue(), endOffset <= finalOffset.intValue());
            }

            if (offsetsAreCorrect) {
                assertTrue("offsets must not go backwards startOffset=" + startOffset + " is < lastStartOffset="
                        + lastStartOffset, offsetAtt.startOffset() >= lastStartOffset);
                lastStartOffset = offsetAtt.startOffset();
            }

            if (offsetsAreCorrect && posLengthAtt != null && posIncrAtt != null) {
                // Validate offset consistency in the graph, ie
                // all tokens leaving from a certain pos have the
                // same startOffset, and all tokens arriving to a
                // certain pos have the same endOffset:
                final int posInc = posIncrAtt.getPositionIncrement();
                pos += posInc;

                final int posLength = posLengthAtt.getPositionLength();

                if (!posToStartOffset.containsKey(pos)) {
                    // First time we've seen a token leaving from this
                    // position:
                    posToStartOffset.put(pos, startOffset);
                    // System.out.println("  + s " + pos + " -> " +
                    // startOffset);
                } else {
                    // We've seen a token leaving from this position
                    // before; verify the startOffset is the same:
                    // System.out.println("  + vs " + pos + " -> " +
                    // startOffset);
                    assertEquals("pos=" + pos + " posLen=" + posLength + " token=" + termAtt,
                            posToStartOffset.get(pos).intValue(), startOffset);
                }

                final int endPos = pos + posLength;

                if (!posToEndOffset.containsKey(endPos)) {
                    // First time we've seen a token arriving to this
                    // position:
                    posToEndOffset.put(endPos, endOffset);
                    // System.out.println("  + e " + endPos + " -> " +
                    // endOffset);
                } else {
                    // We've seen a token arriving to this position
                    // before; verify the endOffset is the same:
                    // System.out.println("  + ve " + endPos + " -> " +
                    // endOffset);
                    assertEquals("pos=" + pos + " posLen=" + posLength + " token=" + termAtt,
                            posToEndOffset.get(endPos).intValue(), endOffset);
                }
            }
        }
        if (posIncrAtt != null) {
            if (i == 0) {
                assertTrue("first posIncrement must be >= 1", posIncrAtt.getPositionIncrement() >= 1);
            } else {
                assertTrue("posIncrement must be >= 0", posIncrAtt.getPositionIncrement() >= 0);
            }
        }
        if (posLengthAtt != null) {
            assertTrue("posLength must be >= 1", posLengthAtt.getPositionLength() >= 1);
        }
    }

    if (ts.incrementToken()) {
        fail("TokenStream has more tokens than expected (expected count=" + output.length + "); extra token="
                + termAtt.toString());
    }

    // repeat our extra safety checks for end()
    ts.clearAttributes();
    if (termAtt != null)
        termAtt.setEmpty().append("bogusTerm");
    if (offsetAtt != null)
        offsetAtt.setOffset(14584724, 24683243);
    if (typeAtt != null)
        typeAtt.setType("bogusType");
    if (posIncrAtt != null)
        posIncrAtt.setPositionIncrement(45987657);
    if (posLengthAtt != null)
        posLengthAtt.setPositionLength(45987653);

    checkClearAtt.getAndResetClearCalled(); // reset it, because we called clearAttributes() before

    ts.end();
    assertTrue("super.end()/clearAttributes() was not called correctly in end()",
            checkClearAtt.getAndResetClearCalled());

    if (finalOffset != null) {
        assertEquals("finalOffset", finalOffset.intValue(), offsetAtt.endOffset());
    }
    if (offsetAtt != null) {
        assertTrue("finalOffset must be >= 0", offsetAtt.endOffset() >= 0);
    }
    if (finalPosInc != null) {
        assertEquals("finalPosInc", finalPosInc.intValue(), posIncrAtt.getPositionIncrement());
    }

    ts.close();
}

From source file:uoc.dedup.document.fingerprintCharikar.java

License:Open Source License

/**
 * Calculate the fingerprint.
 * Split the text into shingles, generate a Rabin hash for each token, and from
 * the result build the final fingerprint vector.
 * @return fingerprint in a string
 */
public String calculateFingerprint() {
    totalTokens = 0;
    totalnGramTokens = 0;
    TokenStream tk = null;

    if (this.useStemming()) {
        this.analyzer = analyzerCache.newAnalyzer(this.language);
        tk = this.analyzer.tokenStream("fingerprint", reader);
    } else {
        tk = new StandardTokenizer(reader);
    }
    ShingleMatrixFilter tokens = new ShingleMatrixFilter(tk, 1, this.getMAXGRAMS(), new Character(' '));

    //Put the tokens in a map and select the most important terms.
    try {
        while (true) {
            Token token = tokens.next();
            if (token == null) {
                break;
            }
            int numtokens = token.term().split(" ").length;
            if (numtokens == 1) {
                this.add(token.term(), this.m); //Add the token to the token frequency list
                //System.out.println(token.term());
                totalTokens++;
            } else if (numtokens >= this.MIMGRAMS) {
                //System.out.println(token.term());
                this.add(token.term(), this.nGrams);
                totalnGramTokens++; //Count the ngram tokens            
            }
        }
        tokens.close();
        this.createTopTerms(this.m, this.getTokensTop(), this.totalTokens);
        //Calculate the fingerprint vector
        this.calculateVectorFingerprint(this.nGrams, this.totalnGramTokens);
        tk.close();
    } catch (IOException e) {
        System.out.println("Error getTokens: " + e.getMessage());
    }
    vFingerprint = this.simHash.getFingerprint();
    this.fingerprint2String();
    return this.getFingerprint();
}

From source file:varaha.text.TokenizeText.java

License:Apache License

/**
   Fills a DataBag with tokens from a TokenStream
 */
public DataBag fillBag(TokenStream stream) throws IOException {
    DataBag result = bagFactory.newDefaultBag();
    CharTermAttribute termAttribute = stream.addAttribute(CharTermAttribute.class);
    try {
        stream.reset();
        while (stream.incrementToken()) {
            if (termAttribute.length() > 0) {
                Tuple termText = tupleFactory.newTuple(termAttribute.toString());
                result.add(termText);
            }
        }
        stream.end();
    } finally {
        stream.close();
    }
    return result;
}

From source file:webdocs.WebDocAnalyzer.java

String preprocessText(String html, boolean title) throws IOException {

    int freqCutoffThreshold = title ? 1 : this.freqCutoffThreshold;

    HashMap<String, Integer> tfMap = new HashMap<>();

    StringBuffer buff = new StringBuffer();
    CharArraySet stopList = StopFilter.makeStopSet(Version.LUCENE_4_9, indexer.buildStopwordList("stopfile"));

    Analyzer webdocAnalyzer = new WebDocAnalyzer(indexer.getProperties(), stopList);
    TokenStream stream = webdocAnalyzer.tokenStream("field", new StringReader(html));
    CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);

    stream.reset();
    while (stream.incrementToken()) {
        String token = termAtt.toString();
        Integer tf = tfMap.get(token);
        if (tf == null) {
            tf = new Integer(0);
        }
        tf++;
        tfMap.put(token, tf);
    }

    stream.end();
    stream.close();

    for (Map.Entry<String, Integer> e : tfMap.entrySet()) {
        String word = e.getKey();
        int tf = e.getValue();
        if (tf >= freqCutoffThreshold) {
            for (int i = 0; i < tf; i++) { // append this word tf times... word order doesn't matter!
                buff.append(word).append(" ");
            }
        }
    }
    return buff.toString();
}

From source file:workTextIndexService.Procesamiento.java

public String normalizar(String texto) {
    resultN = "";
    @SuppressWarnings("deprecation")
    SpanishAnalyzer analyzer = new SpanishAnalyzer(Version.LUCENE_4_10_1);
    try {
        TokenStream stream = analyzer.tokenStream(null, new StringReader(texto));
        stream.reset();
        while (stream.incrementToken()) {
            resultN = resultN + (stream.getAttribute(CharTermAttribute.class).toString()) + " ";
        }
        stream.close();
    } catch (IOException e) {
        throw new RuntimeException(e);
    }
    return resultN.toLowerCase();
}

From source file:wt10g.WTDocument.java

String preProcess(String text) throws Exception {

    StringBuffer tokenizedContentBuff = new StringBuffer();

    TokenStream stream = analyzer.tokenStream("dummy", new StringReader(text));
    CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
    stream.reset();

    while (stream.incrementToken()) {
        String term = termAtt.toString();
        term = term.toLowerCase();
        tokenizedContentBuff.append(term).append(" ");
    }

    stream.end();
    stream.close();
    return tokenizedContentBuff.toString();
}

From source file:yasoco.TermScore.java

Query constructQuery(int docId) throws Exception {
    Query q = null;
    boolean formSelectiveQueries = Boolean.parseBoolean(prop.getProperty("toptermquery", "true"));
    /* MoreLikeThis is not working for some reason!
    if (formSelectiveQueries) {   
       q = mlt.like(docId);
       return q;
    }
    */

    Document queryDoc = reader.document(docId);
    q = new BooleanQuery();
    int termCount = 0;
    TokenStream fs = null;

    List<IndexableField> fields = queryDoc.getFields();

    for (IndexableField field : fields) {
        String fieldName = field.name();
        if (fieldName.equals(JavaSCTree.FIELD_DOCNAME) || fieldName.equals(JavaSCTree.FIELD_SC))
            continue; // ignore non-searchable fields

        if (formSelectiveQueries) {
            List<TermScore> topList = selTerms(docId, field.name(), q);
            for (TermScore ts : topList) {
                Term thisTerm = new Term(field.name(), ts.term);
                ((BooleanQuery) q).add(new TermQuery(thisTerm), BooleanClause.Occur.SHOULD);
            }
        } else {
            fs = queryDoc.getField(fieldName).tokenStream(analyzer);
            CharTermAttribute termAtt = fs.addAttribute(CharTermAttribute.class);
            fs.reset();

            // consume all tokens until the stream is exhausted, adding each as a SHOULD clause
            while (fs.incrementToken()) {
                Term thisTerm = new Term(field.name(), termAtt.toString());
                termCount++;
                if (termCount == maxlimit) {
                    maxlimit = maxlimit << 1;
                    BooleanQuery.setMaxClauseCount(maxlimit);
                }
                ((BooleanQuery) q).add(new TermQuery(thisTerm), BooleanClause.Occur.SHOULD);
            }
            fs.end();
            fs.close();
        }
    }
    return q;
}