List of usage examples for org.apache.lucene.analysis TokenStream getAttribute
public final <T extends Attribute> T getAttribute(Class<T> attClass)
The caller must pass in a Class<? extends Attribute> value. Unlike addAttribute(Class), which registers the attribute if it is not yet present, getAttribute only looks up an existing attribute, so it should be called on a stream whose tokenizer is known to provide it.
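Before the per-project examples below, here is a minimal self-contained sketch of the standard call pattern (reset, incrementToken, end, close). It assumes a recent Lucene (5+) where StandardAnalyzer takes no Version argument; the field name "myfield" and the sample text are placeholders. StandardAnalyzer's tokenizer already declares CharTermAttribute, so getAttribute finds it here; for attributes that might be absent, check hasAttribute first or use addAttribute instead.

import java.io.IOException;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class GetAttributeExample {
    public static void main(String[] args) throws IOException {
        try (StandardAnalyzer analyzer = new StandardAnalyzer();
             TokenStream ts = analyzer.tokenStream("myfield", "Hello attribute world")) {
            // Look up the attribute once; the same instance is updated on every incrementToken().
            CharTermAttribute termAtt = ts.getAttribute(CharTermAttribute.class);
            ts.reset(); // required before the first incrementToken()
            while (ts.incrementToken()) {
                System.out.println(termAtt.toString());
            }
            ts.end(); // finalize offsets and end-of-stream state
        } // try-with-resources closes the stream, then the analyzer
    }
}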
From source file:searching.QueryExpansion.java
/**
 * Calculate positional relevance weights.
 *
 * @param query
 * @param text
 * @param doc_score
 * @param analyzer
 * @param reader
 * @throws IOException
 */
public void addPositionalExpansionDoc(CustomQuery query, String text, double doc_score, Analyzer analyzer,
        IndexReader reader) throws IOException {

    if (actual_pdocs < QueryExpansion.pseudo_rel_docs) {
        TreeMap<String, ArrayList<Long>> query_term_pos = new TreeMap<>();
        Integer length = 0;
        Long pos = 1L;
        String term;
        TokenStream ts = analyzer.tokenStream("myfield", new StringReader(text));
        ArrayList<Long> qpos;
        try {
            ts.reset(); // resets this stream to the beginning (required)
            while (ts.incrementToken()) {
                term = ts.getAttribute(CharTermAttribute.class).toString();
                if (term.length() > 1) {
                    // record every position at which a query term occurs
                    if (query.contains(term)) {
                        qpos = query_term_pos.get(term);
                        if (qpos == null) {
                            qpos = new ArrayList<>();
                        }
                        qpos.add(pos);
                        query_term_pos.put(term, qpos);
                    }
                    length++;
                    pos++;
                }
            }
            ts.end();
        } finally {
            ts.close();
        }

        // All positions collected; iterate over the document again to compute weights.
        TreeMap<String, Double> map = new TreeMap<>();
        Double f;
        pos = 1L;
        double w, w_norm, prob, f0;
        Double pos_length = 0.0;
        Double sum_df = (double) reader.getSumDocFreq("text");
        double spud_pi = SPUDLMSimilarity.b0 * SPUDLMSimilarity.omega
                / (query_term_pos.size() * (1 - SPUDLMSimilarity.omega)
                        + SPUDLMSimilarity.b0 * SPUDLMSimilarity.omega);
        Double df;
        double dist;

        ts = analyzer.tokenStream("myfield", new StringReader(text));
        try {
            ts.reset(); // resets this stream to the beginning (required)
            while (ts.incrementToken()) {
                term = ts.getAttribute(CharTermAttribute.class).toString();
                if (term.length() > 1) {
                    prob = 0.0;
                    w_norm = Math.sqrt(2 * Math.PI * prm_sigma * prm_sigma);
                    for (String qt : query_term_pos.keySet()) {
                        ArrayList<Long> pos_list = query_term_pos.get(qt);
                        w = 1.0;
                        df = (double) reader.docFreq(new Term("text", qt));
                        for (Long p : pos_list) {
                            // Gaussian kernel over the distance between current and query-term positions
                            dist = ((pos - p) * (pos - p)) / (2 * prm_sigma * prm_sigma);
                            f0 = Math.exp(-dist);
                            w += f0;
                        }
                        prob += Math.log(w / w_norm);
                    }
                    // Sum the probabilities over the positional terms in the document.
                    f = map.get(term);
                    if (f == null) {
                        map.put(term, Math.exp(prob));
                    } else {
                        map.put(term, f + Math.exp(prob));
                    }
                    pos_length += Math.exp(prob);
                    pos++;
                }
            }
            ts.end();
        } finally {
            ts.close();
        }

        double sum = 0.0;
        for (String word : map.keySet()) {
            sum += map.get(word) / pos_length;
        }
        // logger.info("sum is " + sum);

        pdocs[actual_pdocs] = map;
        pdoc_lengths[actual_pdocs] = length;
        pdoc_positional_lengths[actual_pdocs] = pos_length;
        pdoc_scores[actual_pdocs] = doc_score;
        actual_pdocs++;
    }
}
From source file:snu.controladores.indexador.Parser.java
/**
 * Tokenizes a string (splits it into words and extracts each word's stem).
 *
 * @param analyzer
 * @param string
 * @return
 * @throws IOException
 */
private List<String> tokenizeString(Analyzer analyzer, String string) throws IOException {
    List<String> result = new ArrayList<>();
    try (TokenStream stream = analyzer.tokenStream(null, new StringReader(string))) {
        stream.reset();
        while (stream.incrementToken()) {
            result.add(stream.getAttribute(CharTermAttribute.class).toString());
        }
        stream.end(); // finalize end-of-stream state before closing
    }
    return result;
}
From source file:snu.controladores.indexador.ProcessadorDeConsultas.java
/**
 * Tokenizes a string (splits it into words and extracts each word's stem).
 *
 * @param analyzer
 * @param string
 * @return
 * @throws IOException
 */
private List<String> tokenizeString(Analyzer analyzer, String string) throws IOException {
    List<String> result = new ArrayList<>();
    try (TokenStream stream = analyzer.tokenStream(null, new StringReader(string))) {
        stream.reset();
        while (stream.incrementToken()) {
            result.add(stream.getAttribute(CharTermAttribute.class).toString());
        }
        stream.end(); // finalize end-of-stream state before closing
    }
    return result;
}
From source file:StopWords.StopWords.java
public String removeStopwords(String input) {
    TokenStream tokenStream = new ClassicTokenizer(Version.LUCENE_35, new StringReader(input));
    // remove stop words
    tokenStream = new StopFilter(Version.LUCENE_35, tokenStream, EnglishAnalyzer.getDefaultStopSet());
    // retrieve the remaining tokens (the set keeps the distinct tokens)
    Set<String> tokens = new HashSet<String>();
    CharTermAttribute token = tokenStream.getAttribute(CharTermAttribute.class);
    StringBuilder str = new StringBuilder();
    try {
        tokenStream.reset();
        while (tokenStream.incrementToken()) {
            tokens.add(token.toString());
            str.append(token.toString()).append(' ');
        }
        tokenStream.end();
        tokenStream.close();
    } catch (IOException ex) {
        Logger.getLogger(StopWords.class.getName()).log(Level.SEVERE, null, ex);
    }
    return str.toString();
}
From source file:summarizer.KeywordsGuesser.java
License:Open Source License
public static String stemmize(String term) throws IOException {
    TokenStream tokenStream = new ClassicTokenizer(LUCENE_VERSION, new StringReader(term));
    tokenStream = new PorterStemFilter(tokenStream);
    Set<String> stems = new HashSet<String>();
    CharTermAttribute token = tokenStream.getAttribute(CharTermAttribute.class);
    tokenStream.reset();
    while (tokenStream.incrementToken()) {
        stems.add(token.toString());
    }
    tokenStream.end();
    tokenStream.close();
    // reject the term if it yields more than one stem
    if (stems.size() != 1) {
        return null;
    }
    String stem = stems.iterator().next();
    // reject stems containing anything other than word characters and '-'
    if (!stem.matches("[\\w-]+")) {
        return null;
    }
    return stem;
}
From source file:summarizer.KeywordsGuesser.java
License:Open Source License
public static List<Keyword> guessFromString(String input) throws IOException {
    // protect hyphens so they survive punctuation stripping, then drop
    // punctuation (except apostrophes and hyphens) and English contractions
    input = input.replaceAll("-+", "-0");
    input = input.replaceAll("[\\p{Punct}&&[^'-]]+", " ");
    input = input.replaceAll("(?:'(?:[tdsm]|[vr]e|ll))+\\b", "");
    TokenStream tokenStream = new ClassicTokenizer(LUCENE_VERSION, new StringReader(input));
    tokenStream = new LowerCaseFilter(LUCENE_VERSION, tokenStream);
    tokenStream = new ClassicFilter(tokenStream);
    tokenStream = new ASCIIFoldingFilter(tokenStream);
    tokenStream = new StopFilter(LUCENE_VERSION, tokenStream, EnglishAnalyzer.getDefaultStopSet());
    List<Keyword> keywords = new LinkedList<Keyword>();
    CharTermAttribute token = tokenStream.getAttribute(CharTermAttribute.class);
    tokenStream.reset();
    while (tokenStream.incrementToken()) {
        String term = token.toString();
        String stem = stemmize(term);
        if (stem != null) {
            // group terms under their stem, restoring the protected hyphens
            Keyword keyword = find(keywords, new Keyword(stem.replaceAll("-0", "-")));
            keyword.add(term.replaceAll("-0", "-"));
        }
    }
    tokenStream.end();
    tokenStream.close();
    Collections.sort(keywords);
    return keywords;
}
From source file:TesterClasses.TestAnalyzer.java
public static List<String> tokenizeString(Analyzer analyzer, String str) {
    List<String> result = new ArrayList<>();
    try {
        TokenStream stream = analyzer.tokenStream(null, new StringReader(str));
        stream.reset();
        while (stream.incrementToken()) {
            result.add(stream.getAttribute(CharTermAttribute.class).toString());
        }
        stream.end();
        stream.close();
    } catch (IOException e) {
        // not thrown because we're reading from a StringReader
        throw new RuntimeException(e);
    }
    return result;
}
From source file:tfidf.TestTfIDF.java
License:CDDL License
public static ArrayList<String> cutWords(String line) throws IOException {
    ArrayList<String> words = new ArrayList<String>();
    IKAnalyzer analyzer = new IKAnalyzer();
    TokenStream tokenStream = analyzer.tokenStream("", new StringReader(line));
    // the attribute instance is stable, so it can be looked up once outside the loop
    CharTermAttribute termAttribute = tokenStream.getAttribute(CharTermAttribute.class);
    tokenStream.reset();
    while (tokenStream.incrementToken()) {
        words.add(termAttribute.toString());
    }
    tokenStream.end();
    tokenStream.close();
    return words;
}
From source file:tw.com.kyle.luminance.LumPositionMap.java
public static LumPositionMap Get(String raw_text) throws IOException {
    StandardAnalyzer analyzer = new StandardAnalyzer();
    TokenStream tstream = analyzer.tokenStream("", raw_text);
    CharTermAttribute termAttr = tstream.getAttribute(CharTermAttribute.class);
    OffsetAttribute offAttr = tstream.getAttribute(OffsetAttribute.class);
    List<String> tokens = new ArrayList<>();
    List<Integer> pos_list = new ArrayList<>();
    tstream.reset();
    while (tstream.incrementToken()) {
        // record each token together with its start offset in the raw text
        tokens.add(termAttr.toString());
        pos_list.add(offAttr.startOffset());
    }
    tstream.end();
    tstream.close();
    return new LumPositionMap(tokens, pos_list);
}
From source file:tw.com.kyle.luminance.LumWindow.java
public List<LumRange> BuildLumRange(long annot_uuid) throws IOException {
    Document adoc = lum_annot.GetAnnotDocument(annot_uuid);
    if (adoc == null) {
        return new ArrayList<>();
    }
    int doc_id = lum_reader.getDocId(adoc);
    TokenStream tokenStream = lum_reader.GetTokenStream(doc_id, "anno");
    if (tokenStream == null) {
        return null;
    }
    OffsetAttribute offAttr = tokenStream.getAttribute(OffsetAttribute.class);
    CharTermAttribute chAttr = tokenStream.getAttribute(CharTermAttribute.class);
    tokenStream.reset();
    List<LumRange> lr_list = new ArrayList<>();
    while (tokenStream.incrementToken()) {
        // capture the term text and its character offsets as one range
        LumRange lr = new LumRange();
        lr.data = chAttr.toString();
        lr.start_off = offAttr.startOffset();
        lr.end_off = offAttr.endOffset();
        lr_list.add(lr);
    }
    tokenStream.end();
    tokenStream.close();
    return lr_list;
}