List of usage examples for org.apache.lucene.analysis TokenStream addAttribute
public final <T extends Attribute> T addAttribute(Class<T> attClass)
From source file:modnlp.idx.inverted.TokeniserJPLucene.java
License:Open Source License
public TokenIndex getTokenIndex(String str) { TokenIndex ret = new TokenIndex(); try {/*from ww w . jav a 2 s. c o m*/ Tokenizer tokenizer = new JapaneseTokenizer(new StringReader(str), null, true, org.apache.lucene.analysis.ja.JapaneseTokenizer.Mode.SEARCH); TokenStream stream = new JapaneseBaseFormFilter(tokenizer); //stream = new JapanesePartOfSpeechStopFilter(true, stream, stoptags); stream = new CJKWidthFilter(stream); //stream = new StopFilter(matchVersion, stream, stopwords); stream = new JapaneseKatakanaStemFilter(stream); //stream = new LowerCaseFilter(matchVersion, stream); OffsetAttribute offsetAttribute = stream.addAttribute(OffsetAttribute.class); CharTermAttribute charTermAttribute = stream.addAttribute(CharTermAttribute.class); while (stream.incrementToken()) { int startOffset = offsetAttribute.startOffset(); int endOffset = offsetAttribute.endOffset(); String token = charTermAttribute.toString(); ret.add(startOffset, endOffset); //System.out.println(token.str+" \t\tS="+token.start+" E="+token.end); } } catch (java.io.IOException e) { System.err.println(e); } return ret; }
From source file:net.sf.okapi.lib.tmdb.lucene.Seeker.java
License:Open Source License
public List<TmHit> searchFuzzy(String genericText, String codesAsString, String tmId, String locale, int max, int threshold, HashMap<String, String> attributes) { float searchThreshold = (float) threshold; if (threshold < 0) searchThreshold = 0.0f;//from w w w . jav a 2s .com if (threshold > 100) searchThreshold = 100.0f; String queryText = genericText; String gtextFName = TmEntry.GTEXT_PREFIX + locale; Locale javaLoc = new Locale(locale); // create basic ngram analyzer to tokenize query TokenStream queryTokenStream; if (javaLoc.getLanguage() == Locale.ENGLISH.getLanguage()) { queryTokenStream = defaultFuzzyAnalyzer.tokenStream(gtextFName, new StringReader(queryText)); } else { queryTokenStream = new NgramAnalyzer(javaLoc, 4).tokenStream(gtextFName, new StringReader(queryText)); } // Get the TermAttribute from the TokenStream CharTermAttribute termAtt = (CharTermAttribute) queryTokenStream.addAttribute(CharTermAttribute.class); TmFuzzyQuery fQuery = new TmFuzzyQuery(searchThreshold, gtextFName); try { queryTokenStream.reset(); while (queryTokenStream.incrementToken()) { //Term t = new Term(keyIndexField, new String(termAtt.buffer())); Term t = new Term(gtextFName, termAtt.toString()); fQuery.add(t); } queryTokenStream.end(); queryTokenStream.close(); } catch (IOException e) { throw new OkapiIOException(e.getMessage(), e); } return getFuzzyHits(fQuery, genericText, codesAsString, tmId, locale, max, searchThreshold, attributes); }
From source file:NewsIR_search.TRECQuery.java
/**
 * Analyzes the content of 'queryField' with the supplied analyzer and returns
 * the resulting tokens, lower-cased and joined by single spaces (with a
 * trailing space when at least one token was produced).
 *
 * @param analyzer   the analyzer used to tokenise the field content
 * @param queryField raw text of the query field
 * @return (String) space-separated, lower-cased analyzed terms
 * @throws Exception if tokenisation fails
 */
public String queryFieldAnalyze(Analyzer analyzer, String queryField) throws Exception {
    // FIX: StringBuilder instead of StringBuffer — no synchronisation is
    // needed for a method-local buffer.
    StringBuilder buff = new StringBuilder();
    TokenStream stream = analyzer.tokenStream(CumulativeIndexer.FIELD_TEXT, new StringReader(queryField));
    CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
    try {
        stream.reset();
        while (stream.incrementToken()) {
            // NOTE(review): toLowerCase() is default-locale sensitive (Turkish
            // dotless-i); consider toLowerCase(Locale.ROOT) — confirm callers.
            buff.append(termAtt.toString().toLowerCase()).append(' ');
        }
        stream.end();
    } finally {
        // FIX: release the stream even if incrementToken() throws.
        stream.close();
    }
    return buff.toString();
}
From source file:ngram.NGramExtractor.java
License:Open Source License
/** * Extracts NGrams from a String of text. * Can handle ngrams of any length and also perform stop word removal before extraction * @param text the text that the ngrams should be extracted from * @param length the length of the ngrams * @param stopWords whether or not stopwords should be removed before extraction * @param overlap whether or not the ngrams should overlap *///from w w w . j a v a 2 s . c om public void extract(String text, int length, Boolean stopWords, Boolean overlap) throws FileNotFoundException, IOException { this.text = text; this.length = length; this.stopWords = stopWords; this.overlap = overlap; nGrams = new LinkedList<String>(); uniqueNGrams = new LinkedList<String>(); nGramFreqs = new HashMap<String, Integer>(); /* If the minLength and maxLength are both 1, then we want unigrams * Make use of a StopAnalyzer when stopwords should be removed * Make use of a SimpleAnalyzer when stop words should be included */ if (length == 1) { if (this.stopWords) { analyzer = new StandardAnalyzer(Version.LUCENE_36); } else { analyzer = new StandardAnalyzer(Version.LUCENE_36, Collections.EMPTY_SET); //Changed from simple to standard to include apostrophe/s } } else { //Bigger than unigrams so use ShingleAnalyzerWrapper. Once again, different analyzers depending on stop word removal if (this.stopWords) { analyzer = new ShingleAnalyzerWrapper(new StopAnalyzer(Version.LUCENE_24), length, length, " ", false, false); //This is a hack to use Lucene 2.4 since in 2.4 position increments weren't preserved by default. Using a later version puts underscores (_) in the place of removed stop words. 
} else { analyzer = new ShingleAnalyzerWrapper( new StandardAnalyzer(Version.LUCENE_36, Collections.EMPTY_SET), length, length, " ", false, false); //Changed from simple to standard to include apostrophe/s } } //Code to process and extract the ngrams TokenStream tokenStream = analyzer.tokenStream("text", new StringReader(this.text)); OffsetAttribute offsetAttribute = tokenStream.addAttribute(OffsetAttribute.class); CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class); int tokenCount = 0; while (tokenStream.incrementToken()) { int startOffset = offsetAttribute.startOffset(); int endOffset = offsetAttribute.endOffset(); String termToken = charTermAttribute.toString(); //The actual token term nGrams.add(termToken); //Add all ngrams to the ngram LinkedList //If n-grams are not allowed to overlap, then increment to point of no overlap if (!overlap) { for (int i = 0; i < length - 1; i++) { tokenStream.incrementToken(); } } } //Store unique nGrams and frequencies in hash tables for (String nGram : nGrams) { if (nGramFreqs.containsKey(nGram)) { nGramFreqs.put(nGram, nGramFreqs.get(nGram) + 1); } else { nGramFreqs.put(nGram, 1); uniqueNGrams.add(nGram); } } }
From source file:nicta.com.au.failureanalysis.optimalquery.OptPatentQuery.java
private String transformation(TokenStream ts, int treshold, String field) throws IOException { Map<String, Integer> m = new HashMap<>(); String q = ""; CharTermAttribute charTermAttribute = ts.addAttribute(CharTermAttribute.class); ts.reset();/*from w w w . ja v a2 s . c om*/ int s = 0; while (ts.incrementToken()) { String term = charTermAttribute.toString().replace(":", "\\:"); q += term + " "; if (m.containsKey(term)) { m.put(term, m.get(term) + 1); } else { m.put(term, 1); } s++; } ts.close(); // return q; q = ""; for (String k : m.keySet()) { if (m.get(k) >= treshold) { if (!Functions.isNumeric(k)) { q += k + "^" + m.get(k) + " "; // System.out.println(k); } } } if (field != null) { vocabulary.put(field, m); } fieldsSize.put(field, s); return q; }
From source file:nicta.com.au.failureanalysis.query.QueryGneration.java
private Map<String, Integer> getTerms(TokenStream ts, int treshold, String field) throws IOException { Map<String, Integer> m = new HashMap<>(); Map<String, Integer> qterm_freq = new HashMap<>(); String q = ""; CharTermAttribute charTermAttribute = ts.addAttribute(CharTermAttribute.class); ts.reset();/*from w w w. j a v a2 s . c o m*/ int s = 0; while (ts.incrementToken()) { String term = charTermAttribute.toString().replace(":", "\\:"); q += term + " "; if (m.containsKey(term)) { m.put(term, m.get(term) + 1); } else { m.put(term, 1); } s++; } ts.close(); // return q; q = ""; // int count = 0; for (String k : m.keySet()) { if (m.get(k) >= treshold) { if (!Functions.isNumeric(k)) { q += k + "^" + m.get(k) + " "; qterm_freq.put(k, m.get(k)); // count++; // System.out.println(count + " " + k + " " + m.get(k)); } } } // System.out.println("-------------------"); if (field != null) { vocabulary.put(field, m); } fieldsSize.put(field, s); // return q; return qterm_freq; }
From source file:nicta.com.au.patent.pac.analysis.FieldsCosineSimilarities.java
/**
 * Builds a tf-idf weighted term vector from the token stream.
 *
 * First pass counts raw term frequencies; the second pass rescales each
 * entry to (tf / totalTokens) * log10(docCount / (df + 1)).
 *
 * @param ts    token stream to consume (caller retains responsibility for closing)
 * @param field index field whose document frequencies are used
 * @return map of term -> tf-idf weight
 * @throws IOException if the stream or index reader fails
 */
private Map<String, Double> getVector(TokenStream ts, String field) throws IOException, Exception {
    Map<String, Double> vector = new HashMap<>();
    CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
    ts.reset();
    int tokenCount = 0;
    while (ts.incrementToken()) {
        tokenCount++;
        String term = termAtt.toString();
        Double seen = vector.get(term);
        vector.put(term, seen == null ? 1.0 : seen + 1);
    }
    for (String term : vector.keySet()) {
        int df = ir.docFreq(new Term(field, term));
        // NOTE(review): document frequency uses 'field' but the document count
        // is hard-coded to "claims" — presumably intentional; confirm.
        int docs = ir.getDocCount("claims");
        double idf = Math.log10((double) docs / (df + 1));
        vector.put(term, (vector.get(term) / tokenCount) * idf);
    }
    return vector;
}
From source file:nicta.com.au.patent.pac.analysis.FieldsJaccardSimilarities.java
private Map<String, Integer> transformation(TokenStream ts) throws IOException { Map<String, Integer> m = new HashMap<>(); CharTermAttribute charTermAttribute = ts.addAttribute(CharTermAttribute.class); ts.reset();// w ww . j av a 2 s . co m while (ts.incrementToken()) { String term = charTermAttribute.toString(); if (m.containsKey(term)) { m.put(term, m.get(term) + 1); } else { m.put(term, 1); } } return m; }
From source file:nicta.com.au.patent.pac.analysis.RecallAnalysis.java
private Set<String> transformation(TokenStream ts) throws IOException { Set<String> out = new HashSet<>(); CharTermAttribute charTermAttribute = ts.addAttribute(CharTermAttribute.class); ts.reset();//from w w w.ja v a 2 s .co m while (ts.incrementToken()) { String term = charTermAttribute.toString(); out.add(term); } return out; }
From source file:nicta.com.au.patent.pac.search.PatentQuery.java
private String transformation(TokenStream ts, int treshold, String field) throws IOException { Map<String, Integer> m = new HashMap<>(); String q = ""; CharTermAttribute charTermAttribute = ts.addAttribute(CharTermAttribute.class); ts.reset();//from w w w. j a v a 2s . c o m int s = 0; while (ts.incrementToken()) { String term = charTermAttribute.toString().replace(":", "\\:"); q += term + " "; if (m.containsKey(term)) { m.put(term, m.get(term) + 1); } else { m.put(term, 1); } s++; } ts.close(); // return q; q = ""; for (String k : m.keySet()) { if (m.get(k) >= treshold) { if (!Functions.isNumeric(k)) { // q += k + "^" + m.get(k) + " "; q += k + "^" + 1/*m.get(k)*/ + " "; // System.out.println(k); } } } if (field != null) { vocabulary.put(field, m); } fieldsSize.put(field, s); return q; }