Usage examples for `org.apache.lucene.analysis.TokenStream#reset()` — method signature: `public void reset() throws IOException`.
From source file:net.simpleframework.ado.lucene.AbstractLuceneManager.java
License:Apache License
@Override public String[] getQueryTokens(final String queryString) { TokenStream tokenStream = null; try {// w w w .ja va2 s . c om tokenStream = getDefaultAnalyzer().tokenStream("QUERY_TOKENS", new StringReader(queryString)); tokenStream.reset(); final ArrayList<String> al = new ArrayList<>(); while (tokenStream.incrementToken()) { final String term = tokenStream.getAttribute(CharTermAttribute.class).toString(); if (term != null && term.length() > 1) { al.add(term); } } if (al.size() == 0) { al.add(queryString); } return al.toArray(new String[al.size()]); } catch (final IOException e) { throw ADOException.of(e); } finally { if (tokenStream != null) { try { tokenStream.close(); } catch (final IOException e) { } } } }
From source file:NewsIR_search.TRECQuery.java
/** * Returns the content of the 'queryField' from the query text * @param analyzer/*from w w w.ja va2 s.co m*/ * @param queryField * @return (String) The content of the field * @throws Exception */ public String queryFieldAnalyze(Analyzer analyzer, String queryField) throws Exception { StringBuffer buff = new StringBuffer(); TokenStream stream = analyzer.tokenStream(CumulativeIndexer.FIELD_TEXT, new StringReader(queryField)); CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class); stream.reset(); while (stream.incrementToken()) { String term = termAtt.toString(); term = term.toLowerCase(); buff.append(term).append(" "); } stream.end(); stream.close(); return buff.toString(); }
From source file:nicta.com.au.failureanalysis.optimalquery.OptPatentQuery.java
private String transformation(TokenStream ts, int treshold, String field) throws IOException { Map<String, Integer> m = new HashMap<>(); String q = ""; CharTermAttribute charTermAttribute = ts.addAttribute(CharTermAttribute.class); ts.reset(); int s = 0;//from w ww .j av a 2 s.c om while (ts.incrementToken()) { String term = charTermAttribute.toString().replace(":", "\\:"); q += term + " "; if (m.containsKey(term)) { m.put(term, m.get(term) + 1); } else { m.put(term, 1); } s++; } ts.close(); // return q; q = ""; for (String k : m.keySet()) { if (m.get(k) >= treshold) { if (!Functions.isNumeric(k)) { q += k + "^" + m.get(k) + " "; // System.out.println(k); } } } if (field != null) { vocabulary.put(field, m); } fieldsSize.put(field, s); return q; }
From source file:nicta.com.au.failureanalysis.query.QueryGneration.java
private Map<String, Integer> getTerms(TokenStream ts, int treshold, String field) throws IOException { Map<String, Integer> m = new HashMap<>(); Map<String, Integer> qterm_freq = new HashMap<>(); String q = ""; CharTermAttribute charTermAttribute = ts.addAttribute(CharTermAttribute.class); ts.reset(); int s = 0;//from ww w. ja v a 2 s. c om while (ts.incrementToken()) { String term = charTermAttribute.toString().replace(":", "\\:"); q += term + " "; if (m.containsKey(term)) { m.put(term, m.get(term) + 1); } else { m.put(term, 1); } s++; } ts.close(); // return q; q = ""; // int count = 0; for (String k : m.keySet()) { if (m.get(k) >= treshold) { if (!Functions.isNumeric(k)) { q += k + "^" + m.get(k) + " "; qterm_freq.put(k, m.get(k)); // count++; // System.out.println(count + " " + k + " " + m.get(k)); } } } // System.out.println("-------------------"); if (field != null) { vocabulary.put(field, m); } fieldsSize.put(field, s); // return q; return qterm_freq; }
From source file:nicta.com.au.patent.pac.analysis.FieldsCosineSimilarities.java
private Map<String, Double> getVector(TokenStream ts, String field) throws IOException, Exception { Map<String, Double> m = new HashMap<>(); CharTermAttribute charTermAttribute = ts.addAttribute(CharTermAttribute.class); ts.reset(); int i = 0;//from ww w . j a v a 2 s. co m while (ts.incrementToken()) { i++; String term = charTermAttribute.toString(); if (m.containsKey(term)) { m.put(term, m.get(term) + 1); } else { m.put(term, 1.0); } } for (String key : m.keySet()) { Term t = new Term(field, key); int totalTF = ir.docFreq(t); int docs = ir.getDocCount("claims"); double idf = Math.log10((double) docs / (totalTF + 1)); m.put(key, (m.get(key) / i) * idf); } return m; }
From source file:nicta.com.au.patent.pac.analysis.FieldsJaccardSimilarities.java
private Map<String, Integer> transformation(TokenStream ts) throws IOException { Map<String, Integer> m = new HashMap<>(); CharTermAttribute charTermAttribute = ts.addAttribute(CharTermAttribute.class); ts.reset(); while (ts.incrementToken()) { String term = charTermAttribute.toString(); if (m.containsKey(term)) { m.put(term, m.get(term) + 1); } else {//from w ww . j a v a 2s . c o m m.put(term, 1); } } return m; }
From source file:nicta.com.au.patent.pac.analysis.RecallAnalysis.java
/**
 * Collects the distinct terms produced by the token stream.
 * The stream is reset and drained but NOT closed; the caller owns its lifecycle.
 *
 * @param ts the token stream to read
 * @return the set of unique terms emitted by the stream
 * @throws IOException if reading the token stream fails
 */
private Set<String> transformation(TokenStream ts) throws IOException {
    final Set<String> terms = new HashSet<>();
    final CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
    ts.reset();
    while (ts.incrementToken()) {
        terms.add(termAtt.toString());
    }
    return terms;
}
From source file:nicta.com.au.patent.pac.search.PatentQuery.java
private String transformation(TokenStream ts, int treshold, String field) throws IOException { Map<String, Integer> m = new HashMap<>(); String q = ""; CharTermAttribute charTermAttribute = ts.addAttribute(CharTermAttribute.class); ts.reset(); int s = 0;/*w w w . j a va 2 s. c o m*/ while (ts.incrementToken()) { String term = charTermAttribute.toString().replace(":", "\\:"); q += term + " "; if (m.containsKey(term)) { m.put(term, m.get(term) + 1); } else { m.put(term, 1); } s++; } ts.close(); // return q; q = ""; for (String k : m.keySet()) { if (m.get(k) >= treshold) { if (!Functions.isNumeric(k)) { // q += k + "^" + m.get(k) + " "; q += k + "^" + 1/*m.get(k)*/ + " "; // System.out.println(k); } } } if (field != null) { vocabulary.put(field, m); } fieldsSize.put(field, s); return q; }
From source file:nl.b3p.viewer.stripes.CatalogSearchActionBean.java
License:Open Source License
private static Or createOrFilter(String queryString, String propertyName) { List orList = new ArrayList(); queryString = createQueryString(queryString, false); if (queryString != null && !queryString.trim().equals(defaultWildCard)) { propertyName = createPropertyName(propertyName); PropertyIsEqualTo propertyIsEqualTo = FilterCreator.createPropertyIsEqualTo(queryString, propertyName); StandardAnalyzer standardAnalyzer = new StandardAnalyzer(Version.LUCENE_45, DutchAnalyzer.getDefaultStopSet()); orList.add(propertyIsEqualTo);// w w w.ja v a 2 s . c om try { TokenStream tokenStream = standardAnalyzer.tokenStream("", queryString); OffsetAttribute offsetAttribute = tokenStream.addAttribute(OffsetAttribute.class); CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class); tokenStream.reset(); while (tokenStream.incrementToken()) { int startOffset = offsetAttribute.startOffset(); int endOffset = offsetAttribute.endOffset(); String term = charTermAttribute.toString(); PropertyIsLike propertyIsLike = FilterCreator.createPropertyIsLike(term, propertyName); orList.add(propertyIsLike); } tokenStream.close(); } catch (IOException e) { PropertyIsLike propertyIsLike = FilterCreator.createPropertyIsLike(queryString, propertyName); orList.add(propertyIsLike); } } Or or = new Or(new BinaryLogicOpType(orList)); return or; }
From source file:nl.cwi.helpers.NGramExtractor.java
License:Open Source License
/** * Extracts NGrams from a String of text. * Can handle ngrams of any length and also perform stop word removal before extraction * @param text the text that the ngrams should be extracted from * @param minLength the minimum length of the ngrams * @param maxLength the maximum length of the ngrams * @param stopWords whether or not stopwords should be removed before extraction */// w ww.j ava 2s .com public void extract(String text, int minLength, int maxLength, Boolean stopWords) throws FileNotFoundException, IOException { this.text = text; this.minLength = minLength; this.maxLength = maxLength; this.stopWords = stopWords; nGrams = new LinkedList<String>(); uniqueNGrams = new LinkedList<String>(); nGramFreqs = new HashMap<String, Integer>(); /* If the minLength and maxLength are both 1, then we want unigrams * Make use of a StopAnalyzer when stopwords should be removed * Make use of a SimpleAnalyzer when stop words should be included */ if ((minLength == 1) && (maxLength == 1)) { if (this.stopWords) { analyzer = new StopAnalyzer(Version.LUCENE_43); } else { analyzer = new SimpleAnalyzer(Version.LUCENE_43); } } else { //Bigger than unigrams so use ShingleAnalyzerWrapper. Once again, different analyzers depending on stop word removal if (this.stopWords) { analyzer = new ShingleAnalyzerWrapper(new StopAnalyzer(Version.LUCENE_42), minLength, maxLength, " ", false, false); //This is a hack to use Lucene 2.4 since in 2.4 position increments weren't preserved by default. Using a later version puts underscores (_) in the place of removed stop words. 
} else { analyzer = new ShingleAnalyzerWrapper(new SimpleAnalyzer(Version.LUCENE_42), minLength, maxLength, " ", false, false); } } //Code to process and extract the ngrams TokenStream tokenStream = analyzer.tokenStream("text", new StringReader(this.text)); OffsetAttribute offsetAttribute = tokenStream.addAttribute(OffsetAttribute.class); CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class); int tokenCount = 0; tokenStream.reset(); //System.out.println("So this is:" + charTermAttribute.toString() ); while (tokenStream.incrementToken()) { //System.out.println("Lets see"); int startOffset = offsetAttribute.startOffset(); int endOffset = offsetAttribute.endOffset(); String termToken = charTermAttribute.toString(); //The actual token term nGrams.add(termToken); //Add all ngrams to the ngram LinkedList } //Store unique nGrams and frequencies in hash tables for (String nGram : nGrams) { if (nGramFreqs.containsKey(nGram)) { nGramFreqs.put(nGram, nGramFreqs.get(nGram) + 1); } else { nGramFreqs.put(nGram, 1); uniqueNGrams.add(nGram); } } }