List of usage examples for org.apache.lucene.analysis TokenStream reset
public void reset() throws IOException
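All of the examples below follow the same consumer contract: reset() must be called once before the first call to incrementToken(), end() after the last token has been consumed, and close() to release resources. A minimal, self-contained sketch of that workflow (the field name, sample text, and choice of StandardAnalyzer are illustrative, not taken from any of the examples):

import java.io.IOException;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class TokenStreamResetSketch {
    public static void main(String[] args) throws IOException {
        Analyzer analyzer = new StandardAnalyzer();
        // try-with-resources closes the stream even if an exception is thrown
        try (TokenStream ts = analyzer.tokenStream("myfield", "some sample text")) {
            CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
            ts.reset(); // required before the first incrementToken()
            while (ts.incrementToken()) {
                System.out.println(term.toString());
            }
            ts.end(); // perform end-of-stream operations, e.g. set the final offset
        }
        analyzer.close();
    }
}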
From source file:org.wltea.analyzer.sample.LuceneTokenizerDemo.java
License:Apache License
/**
 * Tests NGramTokenizer (min gram: 1, max gram: 2).
 */
public void testNT() {
    Tokenizer tokenizer = new NGramTokenizer();
    try {
        tokenizer.setReader(new StringReader("?????IKAnalyer can analysis english text too"));
    } catch (IOException e) {
        throw new RuntimeException(e);
    }
    TokenStreamComponents tsc = new TokenStreamComponents(tokenizer);
    TokenStream ts = tsc.getTokenStream();
    OffsetAttribute offset = ts.addAttribute(OffsetAttribute.class);
    CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
    TypeAttribute type = ts.addAttribute(TypeAttribute.class);
    try {
        ts.reset();
        while (ts.incrementToken()) {
            System.out.println(term.toString() + "->" + offset.startOffset() + "-"
                    + offset.endOffset() + "->" + type.type());
        }
        ts.end();
    } catch (IOException e) {
        throw new RuntimeException(e);
    }
}
From source file:org.wltea.analyzer.sample.ThulacAnalzyerDemo.java
License:Apache License
public static void main(String[] args) {
    // construct the Thulac analyzer with smart segmentation enabled
    Analyzer analyzer = new ThulacAnalyzer(true);
    // obtain a Lucene TokenStream
    TokenStream ts = null;
    try {
        long start = System.currentTimeMillis();
        ts = analyzer.tokenStream("myfield",
                new StringReader("?????IKAnalyer can analysis english text too"));
        // offset attribute
        OffsetAttribute offset = ts.addAttribute(OffsetAttribute.class);
        // term text attribute
        CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
        // token type attribute
        TypeAttribute type = ts.addAttribute(TypeAttribute.class);
        // reset the TokenStream (resets the underlying StringReader)
        ts.reset();
        // iterate over the tokens
        while (ts.incrementToken()) {
            System.out.println(offset.startOffset() + " - " + offset.endOffset() + " : "
                    + term.toString() + " | " + type.type());
        }
        ts.end(); // Perform end-of-stream operations, e.g. set the final offset.
        System.out.println("elapsed: " + (System.currentTimeMillis() - start) + " ms");
    } catch (IOException e) {
        e.printStackTrace();
    } finally {
        // release all resources held by the TokenStream
        if (ts != null) {
            try {
                ts.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }
}
From source file:org.wltea.analyzer.test.IKAnalzyerDemo.java
License:Apache License
public static void main(String[] args) {
    // construct the IK analyzer with smart segmentation enabled
    Analyzer analyzer = new IKAnalyzer4PinYin(true);
    // obtain a Lucene TokenStream
    TokenStream ts = null;
    try {
        ts = analyzer.tokenStream("myfield",
                new StringReader("?????IKAnalyer can analysis english text too"));
        // offset attribute
        OffsetAttribute offset = ts.addAttribute(OffsetAttribute.class);
        // term text attribute
        CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
        // token type attribute
        TypeAttribute type = ts.addAttribute(TypeAttribute.class);
        // reset the TokenStream (resets the underlying StringReader)
        ts.reset();
        // iterate over the tokens
        while (ts.incrementToken()) {
            System.out.println(offset.startOffset() + " - " + offset.endOffset() + " : "
                    + term.toString() + " | " + type.type());
        }
        ts.end(); // Perform end-of-stream operations, e.g. set the final offset.
    } catch (IOException e) {
        e.printStackTrace();
    } finally {
        // release all resources held by the TokenStream
        if (ts != null) {
            try {
                ts.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }
}
From source file:org.zenoss.zep.index.impl.lucene.LuceneQueryBuilder.java
License:Open Source License
/**
 * Tokenizes the given query using the same behavior as when the field is analyzed.
 *
 * @param fieldName The field name in the index.
 * @param analyzer  The analyzer to use to tokenize the query.
 * @param query     The query to tokenize.
 * @return The tokens from the query.
 * @throws ZepException If an exception occurs.
 */
private static List<String> getTokens(String fieldName, Analyzer analyzer, String query) throws ZepException {
    final List<String> tokens = new ArrayList<String>();
    try {
        TokenStream ts = analyzer.tokenStream(fieldName, new StringReader(query));
        CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
        try {
            ts.reset();
            while (ts.incrementToken()) {
                tokens.add(term.toString());
            }
            ts.end();
        } catch (IOException e) {
            throw new ZepException(e.getLocalizedMessage(), e);
        } finally {
            ts.close();
        }
    } catch (IOException e) {
        throw new ZepException(e.getLocalizedMessage(), e);
    }
    return tokens;
}
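The nested try/finally here is the pre-Java-7 way to guarantee close() runs even when reset() or incrementToken() throws, while the outer try covers the analyzer.tokenStream() call itself; the try-with-resources variant shown further below achieves the same thing more compactly.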
From source file:perf.TestAnalyzerPerf.java
License:Apache License
private static void testAnalyzer(String desc, File wikiLinesFile, Analyzer a, int warmupCount, int runCount)
        throws Exception {
    System.out.println("\nTEST: " + desc);

    InputStream is = new FileInputStream(wikiLinesFile);
    // 64 KB buffer
    BufferedReader reader = new BufferedReader(new InputStreamReader(is, "UTF-8"), 1 << 16);

    long startTime = System.currentTimeMillis();
    long sumTime = 0;
    long hash = 0;
    long tokenCount = 0;
    int totCount = warmupCount + runCount;
    for (int i = 0; i < totCount; i++) {
        boolean isWarmup = i < warmupCount;
        if (i % 10000 == 0) {
            System.out.println(String.format(Locale.ROOT, "%.1f sec: %d...",
                    (System.currentTimeMillis() - startTime) / 1000.0, i));
        }
        String s = reader.readLine();
        long t0 = System.nanoTime();
        TokenStream ts = a.tokenStream("field", new StringReader(s));
        ts.reset();

        CharTermAttribute termAtt = ts.getAttribute(CharTermAttribute.class);
        PositionIncrementAttribute posIncAtt;
        if (ts.hasAttribute(PositionIncrementAttribute.class)) {
            posIncAtt = ts.getAttribute(PositionIncrementAttribute.class);
        } else {
            posIncAtt = null;
        }
        OffsetAttribute offsetAtt;
        if (ts.hasAttribute(OffsetAttribute.class)) {
            offsetAtt = ts.getAttribute(OffsetAttribute.class);
        } else {
            offsetAtt = null;
        }

        while (ts.incrementToken()) {
            hash += 31 * ArrayUtil.hashCode(termAtt.buffer(), 0, termAtt.length());
            if (posIncAtt != null) {
                hash += 31 * posIncAtt.getPositionIncrement();
            }
            if (offsetAtt != null) {
                hash += 31 * offsetAtt.startOffset();
                hash += 31 * offsetAtt.endOffset();
            }
            if (isWarmup == false) {
                tokenCount++;
            }
        }
        ts.end();
        ts.close();

        if (isWarmup == false) {
            sumTime += System.nanoTime() - t0;
        }
    }
    reader.close();

    System.out.println(String.format(Locale.ROOT, "%s time=%.2f msec hash=%d tokens=%d",
            desc, (sumTime / 1000000.0), hash, tokenCount));
}
From source file:practica2_1.Practica2_1.java
public static List<String> tokenizeString(Analyzer analyzer, String string) {
    List<String> result = new ArrayList<String>();
    try {
        TokenStream stream = analyzer.tokenStream(null, new StringReader(string));
        CharTermAttribute cAtt = stream.addAttribute(CharTermAttribute.class);
        stream.reset();
        while (stream.incrementToken()) {
            result.add(cAtt.toString());
        }
        // end() must be called before close()
        stream.end();
        stream.close();
    } catch (IOException e) {
        // not thrown because we're using a StringReader
        throw new RuntimeException(e);
    }
    return result;
}
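Because TokenStream implements Closeable, the same helper is harder to get wrong with try-with-resources; a minimal variant sketch (the method name is kept from above, the rethrow-as-RuntimeException choice is illustrative):

public static List<String> tokenizeString(Analyzer analyzer, String text) {
    List<String> result = new ArrayList<>();
    try (TokenStream stream = analyzer.tokenStream(null, new StringReader(text))) {
        CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
        stream.reset();
        while (stream.incrementToken()) {
            result.add(term.toString());
        }
        stream.end(); // end() still runs before the implicit close()
    } catch (IOException e) {
        throw new RuntimeException(e);
    }
    return result;
}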
From source file:practica3b.Practica3b.java
public static TopDocs busquedaAuthor(IndexSearcher is, String tipo, String tipo_year, String authors,
        Integer num1, Integer num2, FacetsCollector fc) throws IOException {
    Analyzer analizador = new StandardAnalyzer();
    List<String> palabras = new ArrayList<String>();
    try {
        TokenStream stream = analizador.tokenStream(null, new StringReader(authors));
        CharTermAttribute catt = stream.addAttribute(CharTermAttribute.class);
        stream.reset();
        while (stream.incrementToken()) {
            palabras.add(catt.toString());
        }
        stream.end();
        stream.close();
    } catch (IOException e) {
        throw new RuntimeException(e);
    }
    ArrayList<BooleanClause> bc = new ArrayList<BooleanClause>();
    for (int i = 0; i < palabras.size(); i++) {
        // one term clause per analyzed token
        Query query = new TermQuery(new Term("Authors", palabras.get(i)));
        if (tipo.equals("should"))
            bc.add(new BooleanClause(query, BooleanClause.Occur.SHOULD));
        else if (tipo.equals("must"))
            bc.add(new BooleanClause(query, BooleanClause.Occur.MUST));
    }
    BooleanQuery.Builder bqbuilder = new BooleanQuery.Builder();
    for (int i = 0; i < bc.size(); i++) {
        bqbuilder.add(bc.get(i));
    }
    if (num1 != null) {
        Query q;
        if (num2 == null) {
            q = IntPoint.newExactQuery("Year", num1);
        } else if (tipo_year.equals("range")) {
            q = IntPoint.newRangeQuery("Year", num1, num2);
        } else {
            q = IntPoint.newSetQuery("Year", num1, num2);
        }
        bqbuilder.add(q, BooleanClause.Occur.MUST);
    }
    BooleanQuery bq = bqbuilder.build();
    fc = new FacetsCollector();
    TopDocs td = FacetsCollector.search(is, bq, 10, fc);
    for (ScoreDoc scoreDoc : td.scoreDocs) {
        Document doc = is.doc(scoreDoc.doc);
        System.out.println(scoreDoc.score + " - " + doc.get("Authors") + " - " + doc.get("Title")
                + " - Year: " + doc.get("Year"));
    }
    return td;
}
From source file:practica3b.Practica3b.java
public static ArrayList<BooleanClause> createClause(String busqueda, int tipo_busqueda, String tipo) {
    Analyzer analizador;
    List<String> palabras = new ArrayList<String>();
    if (tipo_busqueda == 1) {
        analizador = new EnglishAnalyzer();
    } else if (tipo_busqueda == 2) {
        analizador = new StandardAnalyzer();
    } else {
        analizador = new EnglishAnalyzer();
    }
    try {
        TokenStream stream = analizador.tokenStream(null, new StringReader(busqueda));
        CharTermAttribute catt = stream.addAttribute(CharTermAttribute.class);
        stream.reset();
        while (stream.incrementToken()) {
            palabras.add(catt.toString());
        }
        stream.end();
        stream.close();
    } catch (IOException e) {
        throw new RuntimeException(e);
    }
    ArrayList<BooleanClause> bc = new ArrayList<BooleanClause>();
    for (int i = 0; i < palabras.size(); i++) {
        Query q;
        if (tipo_busqueda == 1)
            q = new TermQuery(new Term("Title", palabras.get(i)));
        else if (tipo_busqueda == 2)
            q = new TermQuery(new Term("Authors", palabras.get(i)));
        else
            q = new TermQuery(new Term("Abstract", palabras.get(i)));
        if (tipo.equals("should"))
            bc.add(new BooleanClause(q, BooleanClause.Occur.SHOULD));
        else if (tipo.equals("must"))
            bc.add(new BooleanClause(q, BooleanClause.Occur.MUST));
    }
    return bc;
}
From source file:pt.unlfctdi.cryptosearch.core.client.ClientConnectorBeanWIKI.java
License:Apache License
@Override
public List<Posting> query(String query) {
    try {
        List<Posting> finalScores = new ArrayList<Posting>(12);
        List<WordKey> cipheredWords = new LinkedList<WordKey>();
        TokenStream ts = analyzer.tokenStream(null, new BufferedReader(new StringReader(query)));
        try {
            ts.reset();
            while (ts.incrementToken()) {
                String word = ts.getAttribute(CharTermAttribute.class).toString();
                if (word.length() > 0)
                    cipheredWords.add(new WordKey(crypto.encryptWordKey(word)));
            }
            ts.end();
        } finally {
            ts.close();
        }
        List<CipheredPostingList> cipheredPostings = search.processQuery(cipheredWords);
        for (CipheredPostingList cipherPostings : cipheredPostings) {
            PostingList tfs = crypto.decryptPostingList(cipherPostings.getCipheredPostings());
            PriorityQueue<Posting> postings = new PriorityQueue<Posting>(tfs.getPostings().size());
            for (TermFreq tf : tfs.getPostings())
                postings.add(new Posting(tf.getDocId(), tf.getFreq()));
                //postings.add(new Posting(tf.getDocId(), Utils.bm25(tf.getFreq(), tfs.getDf(),
                //        docsDict.size(), docLengths.get(tf.getDocId()), sumDocLengths)));
            Posting posting;
            while ((posting = postings.poll()) != null) {
                //if (!removedDocs.containsKey(posting.getDocId())) {
                int j = finalScores.indexOf(posting);
                if (j == -1)
                    finalScores.add(posting);
                else
                    finalScores.get(j).setScore(finalScores.get(j).getScore() + posting.getScore());
            }
        }
        Collections.sort(finalScores);
        if (finalScores.size() > 12)
            return finalScores.subList(0, 12);
        else
            return finalScores;
    } catch (IOException e) {
        e.printStackTrace();
        return null;
    }
}
From source file:pt.unlfctdi.cryptosearch.core.client.ClientConnectorBeanWIKI.java
License:Apache License
@Override
public void addFirstDocuments(String xmlFile) {
    WikiXMLParser wxsp = WikiXMLParserFactory.getSAXParser(xmlFile);
    try {
        wxsp.setPageCallback(new PageCallbackHandler() {
            public void process(WikiPage page) {
                if (page.isDisambiguationPage() || page.isRedirect() || page.isSpecialPage())
                    return;
                List<WordKey> cipheredWords = new ArrayList<WordKey>();
                try {
                    TokenStream ts = analyzer.tokenStream(null,
                            new BufferedReader(new StringReader(page.getText())));
                    try {
                        ts.reset();
                        while (ts.incrementToken()) {
                            String word = ts.getAttribute(CharTermAttribute.class).toString();
                            if (word.length() > 0)
                                cipheredWords.add(new WordKey(crypto.encryptWordKey(word)));
                        }
                        ts.end();
                    } finally {
                        ts.close();
                    }
                } catch (IOException e) {
                    e.printStackTrace();
                }
                search.addFirstDocuments(new CDocument(
                        new WordKey(crypto.digest(page.getTitle().getBytes())),
                        cipheredWords.toArray(new WordKey[cipheredWords.size()])));
                // store doc in the cloud
                // cloud.putDoc("" + i, crypto.encryptDocAES(documents[i]));
            }
        });
        wxsp.parse();
        search.buildIndex();
    } catch (Exception e) {
        e.printStackTrace();
    }
}