List of usage examples for org.apache.lucene.analysis TokenStream incrementToken
public abstract boolean incrementToken() throws IOException;
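Before the per-project examples, a minimal self-contained sketch of the consumption contract: reset() before the first incrementToken(), end() after the last, then close(). It assumes a recent Lucene (5+) where StandardAnalyzer has a no-argument constructor; the field name "body" and the input text are illustrative.

    import java.io.IOException;
    import java.io.StringReader;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.standard.StandardAnalyzer;
    import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

    public class IncrementTokenSketch {
        public static void main(String[] args) throws IOException {
            try (StandardAnalyzer analyzer = new StandardAnalyzer();
                    TokenStream ts = analyzer.tokenStream("body", new StringReader("Hello token streams"))) {
                CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
                ts.reset();                   // mandatory before the first incrementToken()
                while (ts.incrementToken()) { // false signals the end of the stream
                    System.out.println(term.toString());
                }
                ts.end();                     // records end-of-stream offset state
            }                                 // try-with-resources closes the stream and analyzer
        }
    }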
From source file:net.sf.zekr.engine.search.lucene.ZekrLuceneAnalyzerTest.java
public void testNextToken1() throws Exception {
    ZekrLuceneAnalyzer zla = new ZekrLuceneAnalyzer(ZekrLuceneAnalyzer.QURAN_LANG_CODE, null);
    TokenStream ts1 = zla.tokenStream(null, new StringReader(ARABIC_STR_ORIG1));
    TokenStream ts2 = new WhitespaceTokenizer(new StringReader(ARABIC_STR1));
    // Attribute instances are reused across incrementToken() calls, so fetch them once.
    TermAttribute t1 = ts1.getAttribute(TermAttribute.class);
    TermAttribute t2 = ts2.getAttribute(TermAttribute.class);
    boolean hasMore = ts1.incrementToken();
    ts2.incrementToken();
    while (hasMore) {
        assertEquals(new String(t1.termBuffer(), 0, t1.termLength()),
                new String(t2.termBuffer(), 0, t2.termLength()));
        hasMore = ts1.incrementToken();
        ts2.incrementToken();
    }
}
From source file:net.sf.zekr.engine.search.lucene.ZekrLuceneAnalyzerTest.java
public void testNextToken2() throws Exception {
    ZekrLuceneAnalyzer zla = new ZekrLuceneAnalyzer(ZekrLuceneAnalyzer.QURAN_LANG_CODE, null);
    TokenStream ts1 = zla.tokenStream(null, new StringReader(ARABIC_STR_ORIG2));
    TokenStream ts2 = new WhitespaceTokenizer(new StringReader(ARABIC_STR2));
    // Attribute instances are reused across incrementToken() calls, so fetch them once.
    TermAttribute t1 = ts1.getAttribute(TermAttribute.class);
    TermAttribute t2 = ts2.getAttribute(TermAttribute.class);
    boolean hasMore = ts1.incrementToken();
    ts2.incrementToken();
    while (hasMore) {
        assertEquals(new String(t1.termBuffer(), 0, t1.termLength()),
                new String(t2.termBuffer(), 0, t2.termLength()));
        hasMore = ts1.incrementToken();
        ts2.incrementToken();
    }
}
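Both Zekr tests use the TermAttribute API that was removed in Lucene 4.0. A minimal sketch of the same two-stream comparison against Lucene 5+, where CharTermAttribute replaces TermAttribute and the reset()/end()/close() lifecycle is mandatory; ZekrLuceneAnalyzer and the test strings are taken from the example, everything else follows the standard API:

    // Hedged sketch, assuming Lucene 5+ and JUnit assertions.
    Tokenizer ws = new WhitespaceTokenizer();
    ws.setReader(new StringReader(ARABIC_STR1));
    TokenStream ts1 = zla.tokenStream(null, new StringReader(ARABIC_STR_ORIG1));
    CharTermAttribute t1 = ts1.addAttribute(CharTermAttribute.class);
    CharTermAttribute t2 = ws.addAttribute(CharTermAttribute.class);
    ts1.reset();
    ws.reset();
    while (ts1.incrementToken()) {
        assertTrue(ws.incrementToken());      // streams must stay in lockstep
        assertEquals(t2.toString(), t1.toString());
    }
    assertFalse(ws.incrementToken());         // and end together
    ts1.end(); ts1.close();
    ws.end(); ws.close();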
From source file:net.simpleframework.ado.lucene.AbstractLuceneManager.java
License:Apache License
@Override
public String[] getQueryTokens(final String queryString) {
    TokenStream tokenStream = null;
    try {
        tokenStream = getDefaultAnalyzer().tokenStream("QUERY_TOKENS", new StringReader(queryString));
        tokenStream.reset();
        final ArrayList<String> al = new ArrayList<>();
        while (tokenStream.incrementToken()) {
            final String term = tokenStream.getAttribute(CharTermAttribute.class).toString();
            if (term != null && term.length() > 1) {
                al.add(term);
            }
        }
        tokenStream.end(); // per the TokenStream contract, end() follows the last incrementToken()
        if (al.isEmpty()) {
            al.add(queryString);
        }
        return al.toArray(new String[al.size()]);
    } catch (final IOException e) {
        throw ADOException.of(e);
    } finally {
        if (tokenStream != null) {
            try {
                tokenStream.close();
            } catch (final IOException e) {
                // ignore close failures
            }
        }
    }
}
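Since TokenStream implements Closeable, the close-in-finally plumbing above can be collapsed with try-with-resources. A minimal sketch, assuming the same getDefaultAnalyzer() and ADOException helpers as the example:

    @Override
    public String[] getQueryTokens(final String queryString) {
        try (TokenStream ts = getDefaultAnalyzer().tokenStream("QUERY_TOKENS", new StringReader(queryString))) {
            final CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
            final List<String> terms = new ArrayList<>();
            ts.reset();
            while (ts.incrementToken()) {
                final String term = termAtt.toString();
                if (term.length() > 1) {
                    terms.add(term);
                }
            }
            ts.end();
            // Fall back to the raw query string when analysis yields nothing.
            return terms.isEmpty() ? new String[] { queryString } : terms.toArray(new String[0]);
        } catch (final IOException e) {
            throw ADOException.of(e);
        }
    }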
From source file:net.skyatlas.icd.dao.daoImpl.AnsjAnalysisTest.java
@Test
public void test() throws IOException {
    Token nt = new Token();
    Analyzer ca = new AnsjAnalysis();
    Reader sentence = new StringReader("\n\n\n\n\n\n\n????, ????????????????????????????"
            + "???????????????????"
            + "??????????? ??????????????2????"
            + "? ????????????? ??? ????????");
    TokenStream ts = ca.tokenStream("sentence", sentence);
    System.out.println("start: " + (new Date()));
    long before = System.currentTimeMillis();
    ts.reset(); // required on Lucene 4+; a no-op default on the 3.x line this test was written for
    while (ts.incrementToken()) {
        System.out.println(ts.getAttribute(CharTermAttribute.class));
    }
    ts.close();
    long now = System.currentTimeMillis();
    System.out.println("time: " + (now - before) / 1000.0 + " s");
}
From source file:net.skyatlas.icd.test.AnsegTest.java
public static void main(String[] args)
        throws IOException, CorruptIndexException, ParseException, InvalidTokenOffsetsException {
    AnsegTest inst = new AnsegTest();
    Token nt = new Token();
    Analyzer ca = new AnsjAnalysis();
    Reader sentence = new StringReader("\n\n\n\n\n\n\n????, ????????????????????????????"
            + "???????????????????"
            + "??????????? ??????????????2????"
            + "? ????????????? ??? ????????");
    TokenStream ts = ca.tokenStream("sentence", sentence);
    System.out.println("start: " + (new Date()));
    long before = System.currentTimeMillis();
    ts.reset(); // required on Lucene 4+; a no-op default on the 3.x line this code targets
    while (ts.incrementToken()) {
        System.out.println(ts.getAttribute(CharTermAttribute.class));
    }
    ts.close();
    long now = System.currentTimeMillis();
    System.out.println("time: " + (now - before) / 1000.0 + " s");

    // Build the stop-word set from the configured stop library.
    HashSet<String> hs = new HashSet<String>();
    BufferedReader reader2 = IOUtil.getReader(ResourceBundle.getBundle("library").getString("stopLibrary"), "UTF-8");
    String word = null;
    while ((word = reader2.readLine()) != null) {
        hs.add(word);
    }

    Analyzer analyzer = new AnsjAnalysis(hs, false);
    Directory directory = null;
    IndexWriter iwriter = null;

    BufferedReader reader = IOUtil.getReader("/Users/changzhenghe/Downloads/hy_statspack01.txt", "UTF-8");
    String temp = null;
    StringBuilder sb = new StringBuilder();
    while ((temp = reader.readLine()) != null) {
        sb.append(temp);
        sb.append("\n");
    }
    reader.close();
    String text = sb.toString();
    text = "???????????? ??? ????????";

    IndexWriterConfig ic = new IndexWriterConfig(Version.LUCENE_32, analyzer);
    directory = new RAMDirectory(); // in-memory index; without this the IndexWriter below gets a null Directory
    iwriter = new IndexWriter(directory, ic);
    // BufferedReader reader = IOUtil.getReader("/Users/ansj/Documents//?//1998?_.txt", "GBK");
    // String temp = null;
    // while ((temp = reader.readLine()) != null) {
    //     addContent(iwriter, temp);
    // }
    inst.addContent(iwriter, "? ?() (?)");
    inst.addContent(iwriter, " ?() (?)");
    inst.addContent(iwriter, "? ? (?)");
    inst.addContent(iwriter, " ??NEC ");
    inst.addContent(iwriter, "?");
    iwriter.commit();
    iwriter.close();

    System.out.println("");
    inst.search(analyzer, directory, "");
    inst.search(analyzer, directory, "");
    inst.search(analyzer, directory, "");
    inst.search(analyzer, directory, "?");

    /*
    KeyWordComputer kwc = new KeyWordComputer(5);
    String title = "??";
    String content = "9??" + "?????????" + "????" + "??" + "?????" + "???"
            + "??????" + "???" + "????20??" + "????" + "?" + "???]??" + "???";
    Collection<Keyword> result = kwc.computeArticleTfidf(title, content);
    System.out.println(result);

    AnsegTest t = new AnsegTest();
    List<Term> parse = ToAnalysis.parse("?");
    System.out.println(parse);
    System.out.println("*********** ? ************");
    // UserDefineLibrary.insertWord("", "userDefine", 1000);
    // UserDefineLibrary.insertWord("?", "userDefine", 1000);
    UserDefineLibrary.insertWord("?", "userDefine", 1000);
    parse = ToAnalysis.parse("???");
    System.out.println(parse);
    */
}
From source file:NewsIR_search.TRECQuery.java
/**
 * Returns the analyzed content of the 'queryField' from the query text.
 * @param analyzer the analyzer to run over the field text
 * @param queryField the raw field text to analyze
 * @return (String) the analyzed, space-separated content of the field
 * @throws Exception
 */
public String queryFieldAnalyze(Analyzer analyzer, String queryField) throws Exception {
    StringBuffer buff = new StringBuffer();
    TokenStream stream = analyzer.tokenStream(CumulativeIndexer.FIELD_TEXT, new StringReader(queryField));
    CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
    stream.reset();
    while (stream.incrementToken()) {
        String term = termAtt.toString();
        term = term.toLowerCase();
        buff.append(term).append(" ");
    }
    stream.end();
    stream.close();
    return buff.toString();
}
From source file:ngram.NGramExtractor.java
License:Open Source License
/**
 * Extracts NGrams from a String of text.
 * Can handle ngrams of any length and also perform stop word removal before extraction.
 * @param text the text that the ngrams should be extracted from
 * @param length the length of the ngrams
 * @param stopWords whether or not stopwords should be removed before extraction
 * @param overlap whether or not the ngrams should overlap
 */
public void extract(String text, int length, Boolean stopWords, Boolean overlap)
        throws FileNotFoundException, IOException {
    this.text = text;
    this.length = length;
    this.stopWords = stopWords;
    this.overlap = overlap;
    nGrams = new LinkedList<String>();
    uniqueNGrams = new LinkedList<String>();
    nGramFreqs = new HashMap<String, Integer>();
    /* If length is 1, we want unigrams.
     * Use a stop-word-aware analyzer when stop words should be removed,
     * and one with an empty stop set when they should be kept. */
    if (length == 1) {
        if (this.stopWords) {
            analyzer = new StandardAnalyzer(Version.LUCENE_36);
        } else {
            // Changed from SimpleAnalyzer to StandardAnalyzer to keep apostrophe-s tokens
            analyzer = new StandardAnalyzer(Version.LUCENE_36, Collections.EMPTY_SET);
        }
    } else {
        // Longer than unigrams, so wrap with ShingleAnalyzerWrapper; again the analyzer depends on stop word removal
        if (this.stopWords) {
            // Hack: StopAnalyzer pinned to LUCENE_24 because 2.4 did not preserve position increments;
            // later versions put underscores (_) in the place of removed stop words.
            analyzer = new ShingleAnalyzerWrapper(new StopAnalyzer(Version.LUCENE_24), length, length, " ",
                    false, false);
        } else {
            analyzer = new ShingleAnalyzerWrapper(new StandardAnalyzer(Version.LUCENE_36, Collections.EMPTY_SET),
                    length, length, " ", false, false);
        }
    }

    // Process the text and extract the ngrams
    TokenStream tokenStream = analyzer.tokenStream("text", new StringReader(this.text));
    CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class);
    while (tokenStream.incrementToken()) {
        String termToken = charTermAttribute.toString(); // the actual token term
        nGrams.add(termToken); // collect every ngram
        // If n-grams may not overlap, skip past the overlapping positions
        if (!overlap) {
            for (int i = 0; i < length - 1; i++) {
                tokenStream.incrementToken();
            }
        }
    }

    // Store unique nGrams and their frequencies in hash tables
    for (String nGram : nGrams) {
        if (nGramFreqs.containsKey(nGram)) {
            nGramFreqs.put(nGram, nGramFreqs.get(nGram) + 1);
        } else {
            nGramFreqs.put(nGram, 1);
            uniqueNGrams.add(nGram);
        }
    }
}
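On current Lucene the version-pinned analyzers and the LUCENE_24 workaround above are gone. A minimal sketch of the same shingle-based n-gram extraction on Lucene 5+, assuming a custom Analyzer built from StandardTokenizer plus ShingleFilter (imports from org.apache.lucene.analysis.* are assumed; the input text is illustrative):

    // Hedged sketch: bigram (shingle) extraction with ShingleFilter on a modern Lucene.
    Analyzer shingles = new Analyzer() {
        @Override
        protected TokenStreamComponents createComponents(String fieldName) {
            Tokenizer source = new StandardTokenizer();
            ShingleFilter filter = new ShingleFilter(source, 2, 2); // bigrams only
            filter.setOutputUnigrams(false);                        // drop the single-word tokens
            return new TokenStreamComponents(source, filter);
        }
    };
    try (TokenStream ts = shingles.tokenStream("text", new StringReader("to be or not to be"))) {
        CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
        ts.reset();
        while (ts.incrementToken()) {
            System.out.println(term); // "to be", "be or", "or not", ...
        }
        ts.end();
    }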
From source file:nicta.com.au.failureanalysis.optimalquery.OptPatentQuery.java
private String transformation(TokenStream ts, int treshold, String field) throws IOException {
    Map<String, Integer> m = new HashMap<>();
    CharTermAttribute charTermAttribute = ts.addAttribute(CharTermAttribute.class);
    ts.reset();
    int s = 0;
    while (ts.incrementToken()) {
        String term = charTermAttribute.toString().replace(":", "\\:");
        if (m.containsKey(term)) {
            m.put(term, m.get(term) + 1);
        } else {
            m.put(term, 1);
        }
        s++;
    }
    ts.close();
    // Keep only non-numeric terms at or above the threshold, emitting each as term^frequency
    String q = "";
    for (String k : m.keySet()) {
        if (m.get(k) >= treshold && !Functions.isNumeric(k)) {
            q += k + "^" + m.get(k) + " ";
        }
    }
    if (field != null) {
        vocabulary.put(field, m);
    }
    fieldsSize.put(field, s);
    return q;
}
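The string this method returns is a space-separated list of term^frequency clauses. A hedged sketch of how such a string could be fed to the classic QueryParser on Lucene 5+ (the parser usage, the analyzer, the field name, and the patentText variable are illustrative assumptions, not shown in the source):

    // Hedged usage sketch: parse the boosted "term^freq" string produced above.
    Analyzer analyzer = new StandardAnalyzer();
    QueryParser parser = new QueryParser("claims", analyzer);
    String boosted = transformation(
            analyzer.tokenStream("claims", new StringReader(patentText)), 2, "claims");
    Query query = parser.parse(boosted); // each surviving term carries its frequency as a boost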
From source file:nicta.com.au.failureanalysis.query.QueryGneration.java
private Map<String, Integer> getTerms(TokenStream ts, int treshold, String field) throws IOException {
    Map<String, Integer> m = new HashMap<>();
    Map<String, Integer> qterm_freq = new HashMap<>();
    CharTermAttribute charTermAttribute = ts.addAttribute(CharTermAttribute.class);
    ts.reset();
    int s = 0;
    while (ts.incrementToken()) {
        String term = charTermAttribute.toString().replace(":", "\\:");
        if (m.containsKey(term)) {
            m.put(term, m.get(term) + 1);
        } else {
            m.put(term, 1);
        }
        s++;
    }
    ts.close();
    // Keep only non-numeric terms at or above the threshold
    for (String k : m.keySet()) {
        if (m.get(k) >= treshold && !Functions.isNumeric(k)) {
            qterm_freq.put(k, m.get(k));
        }
    }
    if (field != null) {
        vocabulary.put(field, m);
    }
    fieldsSize.put(field, s);
    return qterm_freq;
}
From source file:nicta.com.au.patent.pac.analysis.FieldsCosineSimilarities.java
private Map<String, Double> getVector(TokenStream ts, String field) throws IOException, Exception {
    Map<String, Double> m = new HashMap<>();
    CharTermAttribute charTermAttribute = ts.addAttribute(CharTermAttribute.class);
    ts.reset();
    int i = 0;
    while (ts.incrementToken()) {
        i++;
        String term = charTermAttribute.toString();
        if (m.containsKey(term)) {
            m.put(term, m.get(term) + 1);
        } else {
            m.put(term, 1.0);
        }
    }
    // Weight each raw term frequency by its inverse document frequency
    for (String key : m.keySet()) {
        Term t = new Term(field, key);
        int totalTF = ir.docFreq(t);
        int docs = ir.getDocCount("claims");
        double idf = Math.log10((double) docs / (totalTF + 1));
        m.put(key, (m.get(key) / i) * idf);
    }
    return m;
}
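Given two such TF-IDF maps, the cosine similarity the class name refers to could be computed as in this minimal sketch; the helper itself is an assumption, since the source only shows the vector construction:

    // Hedged sketch: cosine similarity between two TF-IDF vectors as built by getVector().
    private static double cosine(Map<String, Double> a, Map<String, Double> b) {
        double dot = 0, normA = 0, normB = 0;
        for (Map.Entry<String, Double> e : a.entrySet()) {
            Double w = b.get(e.getKey());
            if (w != null) {
                dot += e.getValue() * w; // only shared terms contribute to the dot product
            }
            normA += e.getValue() * e.getValue();
        }
        for (double w : b.values()) {
            normB += w * w;
        }
        return (normA == 0 || normB == 0) ? 0 : dot / (Math.sqrt(normA) * Math.sqrt(normB));
    }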