List of usage examples for org.apache.lucene.analysis TokenStream reset
public void reset() throws IOException
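reset() is part of the TokenStream consumer contract: the consumer adds attributes, calls reset() before the first incrementToken(), iterates until incrementToken() returns false, then calls end() and close(). A minimal self-contained sketch of that workflow (assuming a Lucene 4.x StandardAnalyzer; the field name and input text are illustrative):

import java.io.IOException;
import java.io.StringReader;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.util.Version;

public class ResetExample {
    public static void main(String[] args) throws IOException {
        StandardAnalyzer analyzer = new StandardAnalyzer(Version.LUCENE_48);
        TokenStream stream = analyzer.tokenStream("field", new StringReader("The quick brown fox"));
        CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
        stream.reset();                  // mandatory before the first incrementToken()
        while (stream.incrementToken()) {
            System.out.println(termAtt.toString());
        }
        stream.end();                    // records end-of-stream attribute state
        stream.close();                  // releases resources held by the stream
        analyzer.close();
    }
}

The examples below all follow some variant of this pattern.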
From source file: di.uniba.it.tri.occ.BuildOccurrence.java
License: Open Source License

private List<String> getTokens(Reader reader) throws IOException {
    List<String> tokens = new ArrayList<>();
    Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_36);
    TokenStream tokenStream = analyzer.tokenStream("text", reader);
    tokenStream.reset();
    CharTermAttribute cattr = tokenStream.addAttribute(CharTermAttribute.class);
    while (tokenStream.incrementToken()) {
        String token = cattr.toString();
        String[] split = token.split("'");
        if (split.length == 1) {
            tokens.add(token);
        } else {
            // for apostrophe-split tokens, keep the longest fragment
            int max = 0;
            int index = 0;
            for (int i = 0; i < split.length; i++) {
                if (split[i].length() > max) {
                    max = split[i].length();
                    index = i;
                }
            }
            tokens.add(split[index]);
        }
    }
    tokenStream.end();
    tokenStream.close();
    return tokens;
}
From source file: doc2vec.LuceneDocIterator.java

String preProcess(Analyzer analyzer, String text) throws Exception {
    StringBuffer tokenizedContentBuff = new StringBuffer();
    TokenStream stream = analyzer.tokenStream("dummy", new StringReader(text));
    CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
    stream.reset();
    while (stream.incrementToken()) {
        String term = termAtt.toString();
        term = term.toLowerCase();
        if (labelsStoredWithWords) {
            term = term.split("\\" + AMIIndexer.WORD_LABEL_DELIM)[0]; // the first part is the word
        }
        if (!term.trim().equals(""))
            tokenizedContentBuff.append(term).append(" ");
    }
    stream.end();
    stream.close();
    return tokenizedContentBuff.toString();
}
From source file: Document.DocumentProcessor.java

public final void processDocument(Document doc) {
    try {
        CharArraySet ch = new CharArraySet(Version.LUCENE_48, stopWords, true);
        TokenStream tokenStream = new StandardTokenizer(Version.LUCENE_48, new StringReader(doc.getContent()));
        tokenStream = new StopFilter(Version.LUCENE_48, tokenStream, ch); // version now matches the tokenizer
        tokenStream = new PorterStemFilter(tokenStream);
        CharTermAttribute charTermAttribute = tokenStream.getAttribute(CharTermAttribute.class);
        Set<String> uniqueWords = new HashSet<>();
        Map<String, Integer> wordFrequency = new HashMap<>();
        tokenStream.reset();
        while (tokenStream.incrementToken()) {
            String word = charTermAttribute.toString();
            uniqueWords.add(word);
            if (wordFrequency.containsKey(word))
                wordFrequency.put(word, wordFrequency.get(word) + 1);
            else
                wordFrequency.put(word, 1);
            dictionary.add(word);
        }
        tokenStream.end();
        tokenStream.close();
        doc.setUniqueWords(uniqueWords);
        doc.setWordFrequency(wordFrequency);
    } catch (IOException ex) {
        Logger.getLogger(DocumentProcessor.class.getName()).log(Level.SEVERE, null, ex);
    }
}
From source file: drakkar.mast.retrieval.analysis.NGramQuery.java

/**
 * @param analyzer
 * @param queryTerm
 * @param field
 * @throws IOException
 */
public NGramQuery(Analyzer analyzer, String queryTerm, String field) throws IOException {
    // split the query on spaces
    String[] words;
    if (queryTerm.contains(" ")) {
        words = queryTerm.split(" ");
    } else {
        words = new String[] { queryTerm };
    }
    if (words.length > 1) {
        // more than one term: add each as an optional (SHOULD) clause
        for (int i = 0; i < words.length; i++) {
            Term t = new Term(field, words[i]);
            TermQuery pquery = new TermQuery(t);
            add(pquery, org.apache.lucene.search.BooleanClause.Occur.SHOULD);
        }
    } else {
        // a single term: analyze it and add each token as a required (MUST) clause
        for (int i = 0; i < words.length; i++) {
            String wordToAnalyze = words[i];
            TokenStream tokens = analyzer.tokenStream(field, new StringReader(wordToAnalyze));
            TermAttribute termAtt = (TermAttribute) tokens.addAttribute(TermAttribute.class);
            tokens.reset();
            while (tokens.incrementToken()) {
                TermQuery pquery = new TermQuery(new Term(field, termAtt.term()));
                add(new BooleanClause(pquery, org.apache.lucene.search.BooleanClause.Occur.MUST));
            }
            tokens.end();
            tokens.close();
        }
    }
}
From source file: drakkar.mast.retrieval.ngram.NGramQuery.java

/**
 * @param a
 * @param queryTerm
 * @param field
 * @throws IOException
 */
public NGramQuery(Analyzer a, String queryTerm, String field) throws IOException {
    String[] words;
    if (queryTerm.contains(" ")) {
        words = queryTerm.split(" ");
    } else {
        words = new String[] { queryTerm };
    }
    if (words.length > 1) {
        for (int i = 0; i < words.length; i++) {
            Term t = new Term(field, words[i]);
            TermQuery pquery = new TermQuery(t);
            add(pquery, org.apache.lucene.search.BooleanClause.Occur.SHOULD);
        }
    } else {
        for (int i = 0; i < words.length; i++) {
            String wordToAnalyze = words[i];
            TokenStream tokens = a.tokenStream(field, new StringReader(wordToAnalyze));
            TermAttribute termAtt = (TermAttribute) tokens.addAttribute(TermAttribute.class);
            tokens.reset();
            while (tokens.incrementToken()) {
                TermQuery pquery = new TermQuery(new Term(field, termAtt.term()));
                add(new BooleanClause(pquery, org.apache.lucene.search.BooleanClause.Occur.MUST));
            }
            tokens.end();
            tokens.close();
        }
    }
}
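Both NGramQuery variants read term text through TermAttribute, an API from the pre-4.0 Lucene line that was later removed. Under Lucene 4.x the analysis loop would use CharTermAttribute instead; a sketch of just that loop (not from either source file, and assuming the surrounding BooleanQuery logic and the analyzer, field, and wordToAnalyze variables stay as above):

TokenStream tokens = analyzer.tokenStream(field, new StringReader(wordToAnalyze));
CharTermAttribute termAtt = tokens.addAttribute(CharTermAttribute.class);
tokens.reset();
while (tokens.incrementToken()) {
    TermQuery pquery = new TermQuery(new Term(field, termAtt.toString()));
    add(new BooleanClause(pquery, org.apache.lucene.search.BooleanClause.Occur.MUST));
}
tokens.end();
tokens.close();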
From source file: edu.ehu.galan.lite.algorithms.ranked.supervised.tfidf.TFIDF.java
License: Open Source License

protected void computeTFIDF(List<TFIDFTerm> wordList, int totalWordsDoc) {
    if (reader != null && searcher != null) {
        double tf;
        double idf;
        double tfidf;
        EnglishAnalyzer analyzer = new EnglishAnalyzer(Version.LUCENE_40);
        TokenStream stream = null;
        CharTermAttribute termAtt;
        String term;
        double totalWikiDocs = (double) reader.numDocs();
        for (TFIDFTerm word : wordList) {
            try {
                term = "";
                stream = analyzer.tokenStream("field", new StringReader(word.word));
                termAtt = stream.addAttribute(CharTermAttribute.class);
                stream.reset();
                // concatenate all tokens until the stream is exhausted
                while (stream.incrementToken()) {
                    term += termAtt.toString();
                }
                stream.end();
                tf = (double) word.count / (double) totalWordsDoc;
                double wikiTermFrec = reader.docFreq(new Term("contents", term));
                if (wikiTermFrec != 0) {
                    idf = Math.log(totalWikiDocs / wikiTermFrec);
                    tfidf = tf * idf;
                } else {
                    tfidf = 0;
                }
                word.tfidf = tfidf;
            } catch (IOException ex) {
                logger.error("Error processing the TFIDF", ex);
            } finally {
                try {
                    if (stream != null) {
                        stream.close();
                    }
                } catch (IOException ex) {
                    logger.error("Error processing the TFIDF", ex);
                }
            }
        }
        try {
            reader.close();
        } catch (IOException ex) {
            logger.warn("Error closing lucene reader", ex);
        }
    }
}
From source file: edu.indiana.d2i.htrc.io.DataCopyTokenizerMapper.java
License: Apache License

@Override
public void map(Text key, Text value, Context context) throws IOException, InterruptedException {
    long initCPU = System.nanoTime();
    TokenStream stream = analyzer.reusableTokenStream(key.toString(), new StringReader(value.toString()));
    CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
    StringTuple document = new StringTuple();
    stream.reset();
    while (stream.incrementToken()) {
        if (termAtt.length() > 0) {
            String term = new String(termAtt.buffer(), 0, termAtt.length());
            document.add(term);
            numTerms++;
        }
    }
    stream.end();
    elapsedTime += System.nanoTime() - initCPU;
    context.write(key, document);
}
From source file: edu.indiana.d2i.htrc.io.SparseVectorsToMemcached.java
License: Apache License

private static Vector transform2Vector(String text, String field, Analyzer analyzer, HTRCFilter filter,
        Dictionary dictionary) throws IOException {
    Vector result = new RandomAccessSparseVector(dictionary.size());
    TokenStream stream = analyzer.reusableTokenStream(field, new StringReader(text));
    CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
    stream.reset();
    while (stream.incrementToken()) {
        String term = new String(termAtt.buffer(), 0, termAtt.length()).toLowerCase();
        if (filter.accept(term, 0)) {
            int index = dictionary.get(term);
            result.setQuick(index, result.get(index) + 1);
        }
    }
    stream.end();
    return result;
}
From source file: edu.indiana.d2i.htrc.io.SparseVectorUtil.java
License: Apache License

public static Vector transform2Vector(String text, String field, Analyzer analyzer, HTRCFilter filter,
        Dictionary dictionary) throws IOException {
    Vector result = new RandomAccessSparseVector(dictionary.size());
    TokenStream stream = analyzer.reusableTokenStream(field, new StringReader(text));
    CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
    stream.reset();
    while (stream.incrementToken()) {
        String term = new String(termAtt.buffer(), 0, termAtt.length()).toLowerCase();
        if (filter.accept(term, 0)) {
            int index = dictionary.get(term);
            result.setQuick(index, result.get(index) + 1);
        }
    }
    stream.end();
    return result;
}
From source file: edu.indiana.d2i.htrc.util.filter.HTRCFilterAnalyzerTest.java
License: Apache License

public static void main(String[] args) throws IOException {
    HTRCFilterAnalyzer analyzer = new HTRCFilterAnalyzer();
    TokenStream stream = analyzer.reusableTokenStream("field",
            new StringReader("a iss Pierre 1 Vinken , 61 years old , "
                    + "will join the board as joins a nonexecutive joining director Nov. "
                    + "29 .Mr. car Vinken is cars chairman of Elsevier N.V. , the Dutch "
                    + "publishing group ."));
    CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
    stream.reset();
    while (stream.incrementToken()) {
        if (termAtt.length() > 0) {
            System.out.println(new String(termAtt.buffer(), 0, termAtt.length()));
        }
    }
    System.out.println("Done???");
}
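A closing note on the HTRC examples: Analyzer.reusableTokenStream is an API from the Lucene 3.x line that was deprecated late in 3.x and removed in 4.0, where Analyzer.tokenStream reuses the underlying components itself. The equivalent call under Lucene 4.x is simply (a sketch, assuming the same analyzer and field):

TokenStream stream = analyzer.tokenStream("field", new StringReader(value.toString()));

The reset()/incrementToken()/end()/close() consumer contract shown throughout this page is unchanged by that migration.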