List of usage examples for org.apache.lucene.analysis.en PorterStemFilter incrementToken
@Override
public final boolean incrementToken() throws IOException
From source file:com.nec.scg.senseRanking.CountTextSimilarity.java
public Map<String, Float> CountTF_IDF(String str, Analyzer a) { Map<String, Float> termVector = new TreeMap<String, Float>(); try {//from ww w.j av a2 s . co m TokenStream stream = a.tokenStream("content", new StringReader(str)); PorterStemFilter filter = new PorterStemFilter(stream); CharTermAttribute cta = filter.addAttribute(CharTermAttribute.class); filter.reset(); String strcat = null; int wordCount = 0; while (filter.incrementToken()) { strcat = cta.toString(); // System.out.print("["+strcat+"]"); if (!termVector.containsKey(strcat)) { termVector.put(strcat, 1f); wordCount++; } else { termVector.put(strcat, termVector.get(strcat) + 1); wordCount++; } } for (String ter : termVector.keySet()) { int hits = searchIndexforIDF(ter) + 1; float idf = (float) (Math.log(AllArticle * 1.0 / hits) + 1.0); float tf = termVector.get(ter) / wordCount; termVector.put(ter, tf * idf); } filter.end(); stream.end(); filter.close(); stream.close(); } catch (IOException e) { e.printStackTrace(); } return termVector; }
From source file:nl.uva.p2psearch.Main.java
/**
 * Builds inverted-index entries for the description text of a metadata entry.
 *
 * <p>Pass 1: Porter-stems the ArmenianAnalyzer token stream, counting term
 * frequencies keyed by {@code Number160} hash. Pass 2: re-tokenizes the
 * stemmed text and emits shingles (word n-grams, sizes 2..maxNGrams, spaces
 * replaced by underscores), counted into the same dictionary.
 *
 * <p>NOTE(review): an entry is added to the result for EVERY token
 * occurrence, carrying the running (not final) frequency at that point, so a
 * term occurring k times yields k entries with tf = 1..k. This looks like it
 * relies on downstream merging — confirm against the consumer before changing.
 *
 * @param e metadata entry whose description is indexed
 * @return one InvertedIndexEntry per token/shingle occurrence
 * @throws IOException if tokenization fails
 */
private static List<InvertedIndexEntry> getInvertedIndexEntries(final MetadataEntry e) throws IOException {
    String text = e.getDescription();
    // Running occurrence counts, keyed by the hash of the term/shingle text.
    Map<Number160, Integer> dictionary = new HashMap<>();
    List<InvertedIndexEntry> list = new ArrayList<>();
    Analyzer analyzer = new ArmenianAnalyzer(Version.LUCENE_42, Utils.getCharArrayStopwords());
    try (TokenStream tokenStream = analyzer.tokenStream("field", new StringReader(text))) {
        // Pass 1: stemmed unigrams. Closing tokenStream (above) suffices;
        // psf only wraps it.
        PorterStemFilter psf = new PorterStemFilter(tokenStream);
        CharTermAttribute term = psf.addAttribute(CharTermAttribute.class);
        psf.reset();
        // Collects the stemmed tokens so pass 2 shingles the stemmed text.
        StringBuilder sb = new StringBuilder();
        while (psf.incrementToken()) {
            Integer tf;
            Number160 termKey = Number160.createHash(term.toString());
            if (dictionary.containsKey(termKey)) {
                tf = dictionary.get(termKey);
                tf++;
            } else {
                tf = 1;
            }
            dictionary.put(termKey, tf);
            sb.append(term.toString()).append(" ");
            // Posting list holding only this entry's document ID.
            List<Number160> ll = new ArrayList<>();
            ll.add(Number160.createHash(e.getID()));
            list.add(new InvertedIndexEntry(termKey.toString(), term.toString(), tf, ll));
        }
        // Pass 2: shingles (n-grams) over the stemmed text built above.
        StandardTokenizer source = new StandardTokenizer(Version.LUCENE_42, new StringReader(sb.toString()));
        TokenStream tokenStreamSF = new StandardFilter(Version.LUCENE_42, source);
        try (ShingleFilter sf = new ShingleFilter(tokenStreamSF, 2, maxNGrams)) {
            // Unigrams were already emitted in pass 1 — shingles only here.
            sf.setOutputUnigrams(false);
            CharTermAttribute charTermAttribute = sf.addAttribute(CharTermAttribute.class);
            sf.reset();
            while (sf.incrementToken()) {
                String word = charTermAttribute.toString();
                // Normalize shingle separator so the key is a single token.
                String ng = word.replaceAll(" ", "_");
                Integer tf;
                Number160 termKey = Number160.createHash(ng);
                if (dictionary.containsKey(termKey)) {
                    tf = dictionary.get(termKey);
                    tf++;
                } else {
                    tf = 1;
                }
                dictionary.put(termKey, tf);
                List<Number160> ll = new ArrayList<>();
                ll.add(Number160.createHash(e.getID()));
                list.add(new InvertedIndexEntry(termKey.toString(), ng, tf, ll));
            }
        }
    }
    return list;
}