Example usage for org.apache.lucene.analysis TokenStream end

List of usage examples for org.apache.lucene.analysis TokenStream end

Introduction

On this page you can find example usage for org.apache.lucene.analysis TokenStream end.

Prototype

public void end() throws IOException 

Document

This method is called by the consumer after the last token has been consumed, i.e. after #incrementToken() returned false (using the new TokenStream API).
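
All of the usage examples below follow the same consumer workflow: call reset() before the first incrementToken(), call end() once incrementToken() returns false, then close() the stream. A minimal sketch of that workflow (assuming Lucene 5.x or later and a StandardAnalyzer; the field name and sample text are arbitrary):

import java.io.IOException;
import java.io.StringReader;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;

public class TokenStreamEndSketch {
    public static void main(String[] args) throws IOException {
        Analyzer analyzer = new StandardAnalyzer();
        TokenStream ts = analyzer.tokenStream("body", new StringReader("a quick example"));
        CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
        OffsetAttribute offset = ts.addAttribute(OffsetAttribute.class);
        try {
            ts.reset(); // mandatory before the first incrementToken()
            while (ts.incrementToken()) {
                System.out.println(term.toString());
            }
            ts.end(); // end-of-stream operations; the offset attribute now holds the final offset
            System.out.println("final offset: " + offset.endOffset());
        } finally {
            ts.close(); // release resources even if an exception was thrown
        }
    }
}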

Usage

From source file:org.wltea.analyzer.test.IKAnalzyerDemo.java

License:Apache License

public static void main(String[] args) {
    // Build the IK analyzer in smart segmentation mode
    Analyzer analyzer = new IKAnalyzer4PinYin(true);

    // Obtain a Lucene TokenStream
    TokenStream ts = null;
    try {
        ts = analyzer.tokenStream("myfield", new StringReader(
                "这是一个中文分词的例子，你可以直接运行它！IKAnalyer can analysis english text too"));
        // Offset attribute of each token
        OffsetAttribute offset = ts.addAttribute(OffsetAttribute.class);
        // Term text attribute
        CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
        // Token type attribute
        TypeAttribute type = ts.addAttribute(TypeAttribute.class);

        // Reset the TokenStream (this also resets the underlying StringReader)
        ts.reset();
        // Iterate over the tokens
        while (ts.incrementToken()) {
            System.out.println(offset.startOffset() + " - " + offset.endOffset() + " : " + term.toString()
                    + " | " + type.type());
        }
        ts.end(); // Perform end-of-stream operations, e.g. set the final offset.

    } catch (IOException e) {
        e.printStackTrace();
    } finally {
        // Release all TokenStream resources (this also closes the StringReader)
        if (ts != null) {
            try {
                ts.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }

}
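
The verbose try/finally bookkeeping in the demo above predates Java 7. Since TokenStream implements Closeable, the same consume loop can be written with try-with-resources; a minimal sketch, where analyzer and text stand in for the values used above:

try (TokenStream ts = analyzer.tokenStream("myfield", new StringReader(text))) {
    CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
    ts.reset();
    while (ts.incrementToken()) {
        System.out.println(term.toString());
    }
    ts.end(); // still required; try-with-resources only guarantees close()
}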

From source file:org.zenoss.zep.index.impl.lucene.LuceneQueryBuilder.java

License:Open Source License

/**
 * Tokenizes the given query using the same behavior as when the field is analyzed.
 *
 * @param fieldName The field name in the index.
 * @param analyzer  The analyzer to use to tokenize the query.
 * @param query     The query to tokenize.
 * @return The tokens from the query.
 * @throws ZepException If an exception occurs.
 */
private static List<String> getTokens(String fieldName, Analyzer analyzer, String query) throws ZepException {
    final List<String> tokens = new ArrayList<String>();
    try {
        TokenStream ts = analyzer.tokenStream(fieldName, new StringReader(query));
        CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
        try {
            ts.reset();
            while (ts.incrementToken()) {
                tokens.add(term.toString());
            }
            ts.end();
        } catch (IOException e) {
            throw new ZepException(e.getLocalizedMessage(), e);
        } finally {
            ts.close();
        }
    } catch (IOException e) {
        throw new ZepException(e.getLocalizedMessage(), e);
    }
    return tokens;
}

From source file:perf.TestAnalyzerPerf.java

License:Apache License

private static void testAnalyzer(String desc, File wikiLinesFile, Analyzer a, int warmupCount, int runCount)
        throws Exception {
    System.out.println("\nTEST: " + desc);

    // 64 KB buffer
    InputStream is = new FileInputStream(wikiLinesFile);
    BufferedReader reader = new BufferedReader(new InputStreamReader(is, "UTF-8"), 1 << 16);

    long startTime = System.currentTimeMillis();
    long sumTime = 0;
    long hash = 0;
    long tokenCount = 0;
    int totCount = warmupCount + runCount;
    for (int i = 0; i < totCount; i++) {

        boolean isWarmup = i < warmupCount;

        if (i % 10000 == 0) {
            System.out.println(String.format(Locale.ROOT, "%.1f sec: %d...",
                    (System.currentTimeMillis() - startTime) / 1000.0, i));
        }
        String s = reader.readLine();
        long t0 = System.nanoTime();
        TokenStream ts = a.tokenStream("field", new StringReader(s));
        ts.reset();

        CharTermAttribute termAtt = ts.getAttribute(CharTermAttribute.class);
        PositionIncrementAttribute posIncAtt;
        if (ts.hasAttribute(PositionIncrementAttribute.class)) {
            posIncAtt = ts.getAttribute(PositionIncrementAttribute.class);
        } else {
            posIncAtt = null;
        }
        OffsetAttribute offsetAtt;
        if (ts.hasAttribute(OffsetAttribute.class)) {
            offsetAtt = ts.getAttribute(OffsetAttribute.class);
        } else {
            offsetAtt = null;
        }

        while (ts.incrementToken()) {
            hash += 31 * ArrayUtil.hashCode(termAtt.buffer(), 0, termAtt.length());
            if (posIncAtt != null) {
                hash += 31 * posIncAtt.getPositionIncrement();
            }
            if (offsetAtt != null) {
                hash += 31 * offsetAtt.startOffset();
                hash += 31 * offsetAtt.endOffset();
            }
            if (isWarmup == false) {
                tokenCount++;
            }
        }
        ts.end();
        ts.close();

        if (isWarmup == false) {
            sumTime += System.nanoTime() - t0;
        }
    }
    reader.close();

    System.out.println(String.format(Locale.ROOT, "%s time=%.2f msec hash=%d tokens=%d", desc,
            (sumTime / 1000000.0), hash, tokenCount));
}

From source file:practica2_1.Practica2_1.java

public static List<String> tokenizeString(Analyzer analyzer, String string) {
    List<String> result = new ArrayList<String>();

    try {

        TokenStream stream = analyzer.tokenStream(null, new StringReader(string));
        //OffsetAttribute offsetAtt = stream.addAttribute(OffsetAttribute.class);
        CharTermAttribute cAtt = stream.addAttribute(CharTermAttribute.class);
        stream.reset();

        while (stream.incrementToken()) {
            result.add(cAtt.toString());
        }
        stream.end();
        stream.close();
    } catch (IOException e) {
        // not thrown b/c we're using a string reader...
        throw new RuntimeException(e);
    }
    return result;
}

From source file:practica3b.Practica3b.java

public static TopDocs busquedaAuthor(IndexSearcher is, String tipo, String tipo_year, String authors,
        Integer num1, Integer num2, FacetsCollector fc) throws IOException {
    Analyzer analizador = new StandardAnalyzer();
    List<String> palabras = new ArrayList<String>();
    try {
        TokenStream stream = analizador.tokenStream(null, new StringReader(authors));
        CharTermAttribute catt = stream.addAttribute(CharTermAttribute.class);
        stream.reset();
        while (stream.incrementToken()) {
            palabras.add(catt.toString());
        }
        stream.end();
        stream.close();
    } catch (IOException e) {
        throw new RuntimeException(e);
    }
    ArrayList<BooleanClause> bc = new ArrayList<BooleanClause>();
    for (int i = 0; i < palabras.size(); i++) {
        Query query = new TermQuery(new Term("Authors", palabras.get(i)));
        if (tipo.equals("should"))
            bc.add(new BooleanClause(query, BooleanClause.Occur.SHOULD));
        else if (tipo.equals("must"))
            bc.add(new BooleanClause(query, BooleanClause.Occur.MUST));
    }
    BooleanQuery.Builder bqbuilder = new BooleanQuery.Builder();
    for (int i = 0; i < bc.size(); i++) {
        bqbuilder.add(bc.get(i));
    }
    if (num1 != null) {
        Query q;
        if (num2 == null) {
            q = IntPoint.newExactQuery("Year", num1);
            bqbuilder.add(q, BooleanClause.Occur.MUST);
        } else {
            if (tipo_year.equals("range")) {
                q = IntPoint.newRangeQuery("Year", num1, num2);
                bqbuilder.add(q, BooleanClause.Occur.MUST);
            } else {
                q = IntPoint.newSetQuery("Year", num1, num2);
                bqbuilder.add(q, BooleanClause.Occur.MUST);
            }
        }
    }
    BooleanQuery bq = bqbuilder.build();
    fc = new FacetsCollector();
    TopDocs td = FacetsCollector.search(is, bq, 10, fc);
    for (ScoreDoc scoreDoc : td.scoreDocs) {
        Document doc = is.doc(scoreDoc.doc);
        System.out.println(scoreDoc.score + " - " + doc.get("Authors") + " - " + doc.get("Title") + " - Year: "
                + doc.get("Year"));
    }
    return td;
}

From source file:practica3b.Practica3b.java

public static ArrayList<BooleanClause> createClause(String busqueda, int tipo_busqueda, String tipo) {
    Analyzer analizador;
    List<String> palabras = new ArrayList<String>();
    if (tipo_busqueda == 1) {
        analizador = new EnglishAnalyzer();
    } else if (tipo_busqueda == 2) {
        analizador = new StandardAnalyzer();
    } else {
        analizador = new EnglishAnalyzer();
    }
    try {
        TokenStream stream = analizador.tokenStream(null, new StringReader(busqueda));
        CharTermAttribute catt = stream.addAttribute(CharTermAttribute.class);
        stream.reset();
        while (stream.incrementToken()) {
            palabras.add(catt.toString());
        }
        stream.end();
        stream.close();
    } catch (IOException e) {
        throw new RuntimeException(e);
    }

    ArrayList<BooleanClause> bc = new ArrayList<BooleanClause>();
    for (int i = 0; i < palabras.size(); i++) {
        Query q;
        if (tipo_busqueda == 1)
            q = new TermQuery(new Term("Title", palabras.get(i)));
        else if (tipo_busqueda == 2)
            q = new TermQuery(new Term("Authors", palabras.get(i)));
        else
            q = new TermQuery(new Term("Abstract", palabras.get(i)));
        if (tipo.equals("should"))
            bc.add(new BooleanClause(q, BooleanClause.Occur.SHOULD));
        else if (tipo.equals("must"))
            bc.add(new BooleanClause(q, BooleanClause.Occur.MUST));
    }
    return bc;
}

From source file:pt.unlfctdi.cryptosearch.core.client.ClientConnectorBeanWIKI.java

License:Apache License

@Override
public List<Posting> query(String query) {
    try {
        List<Posting> finalScores = new ArrayList<Posting>(12);

        List<WordKey> cipheredWords = new LinkedList<WordKey>();
        TokenStream ts = analyzer.tokenStream(null, new BufferedReader(new StringReader(query)));
        try {
            ts.reset();
            while (ts.incrementToken()) {
                String word = ts.getAttribute(CharTermAttribute.class).toString();
                if (word.length() > 0)
                    cipheredWords.add(new WordKey(crypto.encryptWordKey(word)));
            }
            ts.end();
        } finally {
            ts.close();
        }
        List<CipheredPostingList> cipheredPostings = search.processQuery(cipheredWords);
        for (CipheredPostingList cipherPostings : cipheredPostings) {
            PostingList tfs = crypto.decryptPostingList(cipherPostings.getCipheredPostings());

            PriorityQueue<Posting> postings = new PriorityQueue<Posting>(tfs.getPostings().size());
            for (TermFreq tf : tfs.getPostings())
                postings.add(new Posting(tf.getDocId(), tf.getFreq()));
            //postings.add(new Posting(tf.getDocId(), Utils.bm25(tf.getFreq(), tfs.getDf(),
            //   docsDict.size(), docLengths.get(tf.getDocId()), sumDocLengths)));

            Posting posting;
            while ((posting = postings.poll()) != null) {
                //if (!removedDocs.containsKey(posting.getDocId())) {
                int j = finalScores.indexOf(posting);
                if (j == -1)
                    finalScores.add(posting);
                else
                    finalScores.get(j).setScore(finalScores.get(j).getScore() + posting.getScore());
            }
        }
        Collections.sort(finalScores);
        if (finalScores.size() > 12)
            return finalScores.subList(0, 12);
        else
            return finalScores;
    } catch (IOException e) {
        e.printStackTrace();
        return null;
    }
}

From source file:pt.unlfctdi.cryptosearch.core.client.ClientConnectorBeanWIKI.java

License:Apache License

@Override
public void addFirstDocuments(String xmlFile) {
    WikiXMLParser wxsp = WikiXMLParserFactory.getSAXParser(xmlFile);
    try {
        wxsp.setPageCallback(new PageCallbackHandler() {
            public void process(WikiPage page) {
                if (page.isDisambiguationPage() || page.isRedirect() || page.isSpecialPage())
                    return;
                List<WordKey> cipheredWords = new ArrayList<WordKey>();
                try {
                    TokenStream ts = analyzer.tokenStream(null,
                            new BufferedReader(new StringReader(page.getText())));
                    try {
                        ts.reset();
                        while (ts.incrementToken()) {
                            String word = ts.getAttribute(CharTermAttribute.class).toString();
                            if (word.length() > 0)
                                cipheredWords.add(new WordKey(crypto.encryptWordKey(word)));
                        }
                        ts.end();
                    } finally {
                        ts.close();
                    }
                } catch (IOException e) {
                    e.printStackTrace();
                }
                search.addFirstDocuments(new CDocument(new WordKey(crypto.digest(page.getTitle().getBytes())),
                        cipheredWords.toArray(new WordKey[cipheredWords.size()])));
                //store doc in the cloud
                //            cloud.putDoc(""+i, crypto.encryptDocAES(documents[i]));
            }
        });
        wxsp.parse();
        search.buildIndex();
    } catch (Exception e) {
        e.printStackTrace();
    }
}

From source file:pt.unlfctdi.cryptosearch.core.client.ClientConnectorBeanWIKIHom.java

License:Apache License

@Override
public List<Posting> query(String query) {
    try {
        List<Posting> finalScores = new ArrayList<Posting>(12);

        List<WordKey> cipheredWords = new LinkedList<WordKey>();
        TokenStream ts = analyzer.tokenStream(null, new BufferedReader(new StringReader(query)));
        try {
            ts.reset();
            while (ts.incrementToken()) {
                String word = ts.getAttribute(CharTermAttribute.class).toString();
                if (word.length() > 0)
                    cipheredWords.add(new WordKey(crypto.encryptWordKey(word)));
            }
            ts.end();
        } finally {
            ts.close();
        }
        List<HomPosting> homScores = search.processQuery(cipheredWords);
        for (HomPosting posting : homScores) {
            finalScores.add(new Posting(posting.getDocId(), crypto.decryptHom(posting.getScore()).intValue()));
        }
        Collections.sort(finalScores);
        if (finalScores.size() > 12)
            return finalScores.subList(0, 12);
        else
            return finalScores;
    } catch (IOException e) {
        e.printStackTrace();
        return null;
    }
}

From source file:pt.unlfctdi.cryptosearch.core.client.PrototypeClientConnector.java

License:Apache License

public List<Posting> query(String query) {
    try {
        List<Posting> finalScores = new ArrayList<Posting>(12);

        List<WordKey> cipheredWords = new LinkedList<WordKey>();
        TokenStream ts = analyzer.tokenStream(null, new BufferedReader(new StringReader(query)));
        try {
            ts.reset();
            while (ts.incrementToken()) {
                String word = ts.getAttribute(CharTermAttribute.class).toString();
                if (word.length() > 0)
                    cipheredWords.add(new WordKey(crypto.encryptWordKey(word)));
            }
            ts.end();
        } finally {
            ts.close();
        }
        List<CipheredPostingList> cipheredPostings = search.processQuery(cipheredWords);
        for (CipheredPostingList cipherPostings : cipheredPostings) {
            PostingList tfs = crypto.decryptPostingList(cipherPostings.getCipheredPostings());

            PriorityQueue<Posting> postings = new PriorityQueue<Posting>(tfs.getPostings().size());
            for (TermFreq tf : tfs.getPostings())
                postings.add(new Posting(tf.getDocId(), tf.getFreq()));
            //postings.add(new Posting(tf.getDocId(), Utils.bm25(tf.getFreq(), tfs.getDf(),
            //   docsDict.size(), docLengths.get(tf.getDocId()), sumDocLengths)));

            Posting posting;
            while ((posting = postings.poll()) != null) {
                //if (!removedDocs.containsKey(posting.getDocId())) {
                int j = finalScores.indexOf(posting);
                if (j == -1)
                    finalScores.add(posting);
                else
                    finalScores.get(j).setScore(finalScores.get(j).getScore() + posting.getScore());
            }
        }
        Collections.sort(finalScores);
        if (finalScores.size() > 12)
            return finalScores.subList(0, 12);
        else
            return finalScores;
    } catch (IOException e) {
        e.printStackTrace();
        return null;
    }
}