List of usage examples for org.apache.lucene.analysis TokenStream end
public void end() throws IOException
This method is called by the consumer after the last token has been consumed, i.e. after incrementToken() returned false (using the new TokenStream API). It can be used to perform any end-of-stream operations, such as setting the final offset of a stream.
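For reference, here is a minimal sketch of the canonical consume pattern that the examples below follow: reset(), an incrementToken() loop, end(), then close(). The helper name tokenize, the field name "field" and the inputs are illustrative placeholders, not taken from any of the listed source files.

import java.io.IOException;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.List;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public final class TokenStreamEndSketch {
    // Hypothetical helper: collects the term text of every token produced by the analyzer.
    static List<String> tokenize(Analyzer analyzer, String text) throws IOException {
        List<String> tokens = new ArrayList<>();
        try (TokenStream ts = analyzer.tokenStream("field", new StringReader(text))) {
            CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
            ts.reset();                    // must be called before the first incrementToken()
            while (ts.incrementToken()) {
                tokens.add(termAtt.toString());
            }
            ts.end();                      // end-of-stream operations, e.g. set the final offset
        }                                  // try-with-resources closes the stream
        return tokens;
    }
}

Using try-with-resources guarantees that close() runs even if reset(), incrementToken() or end() throws, which several of the examples below handle manually in a finally block instead.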
From source file:com.github.tteofili.looseen.MinHashClassifier.java
License:Apache License
private ArrayList<String> getTokens(Analyzer analyzer, String field, String value) throws IOException {
    ArrayList<String> tokens = new ArrayList<String>();
    TokenStream ts = analyzer.tokenStream(field, value);
    ts.reset();
    while (ts.incrementToken()) {
        CharTermAttribute termAttribute = ts.getAttribute(CharTermAttribute.class);
        String token = new String(termAttribute.buffer(), 0, termAttribute.length());
        tokens.add(token);
    }
    ts.end();
    ts.close();
    return tokens;
}
From source file:com.jaeksoft.searchlib.request.SearchField.java
License:Open Source License
final private List<TermQueryItem> getTermQueryFilter(final PerFieldAnalyzer perFieldAnalyzer,
        CompiledAnalyzer compiledAnalyzer, final String queryString) throws IOException {
    TokenStream ts = null;
    TokenQueryFilter.TermQueryFilter tqf = null;
    Analyzer analyzer = compiledAnalyzer != null ? compiledAnalyzer : perFieldAnalyzer.getKeywordAnalyzer();
    try {
        ts = analyzer.tokenStream(field, new StringReader(queryString));
        tqf = new TermQueryFilter(compiledAnalyzer, field, (float) termBoost, ts);
        while (tqf.incrementToken())
            ;
        ts.end();
        ts.close();
        tqf.sortByOffset();
        TermQueryFilter.includeChildrenBrothers(tqf.termQueryItems);
        for (TermQueryItem termQueryItem : tqf.termQueryItems)
            termQueryItem.includeChildrenBrothers();
        return tqf.termQueryItems;
    } finally {
        IOUtils.close(tqf, ts, analyzer);
    }
}
From source file:com.jamespot.glifpix.index.ResourceDocument.java
License:Open Source License
private void addLiteralField(String literal) throws IOException {
    _luceneDocument
            .add(new Field("literal", replaceUnicodeStr(literal), Store.YES, Index.NOT_ANALYZED_NO_NORMS));
    String coolLiteral = literal.replaceAll("\\\"", "");
    coolLiteral = replaceUnicodeStr(coolLiteral);
    Analyzer resAnalyzer = new ContentAnalyzer();
    TokenStream ts = resAnalyzer.tokenStream("dummyField", new StringReader(coolLiteral));
    TermAttribute termAttribute = ts.addAttribute(TermAttribute.class);
    int length = 0;
    StringBuffer sb = new StringBuffer();
    while (ts.incrementToken()) {
        sb.append("_" + termAttribute.term());
        length++;
    }
    sb.insert(0, length);
    _resourceLength = length;
    ts.end();
    ts.close();
    String finalToken = sb.toString();
    _luceneDocument.add(new Field("token", finalToken, Store.YES, Index.NOT_ANALYZED_NO_NORMS));
    _luceneDocument.add(new Field("crc", Utils.getCRC(finalToken), Store.YES, Index.NOT_ANALYZED_NO_NORMS));
}
From source file:com.jamespot.glifpix.library.TagsExtractorImpl.java
License:Open Source License
public Map<String, Integer> getTagsFreq(String content, String lng) {
    Map<String, Integer> items = new HashMap<String, Integer>();
    TokensArray tokArray = new TokensArray(_MaxExpressionLength);
    TokenStream ts = _contentAnalyzer.tokenStream("dummyField", new StringReader(content));
    TermAttribute termAttribute = ts.addAttribute(TermAttribute.class);
    try {
        while (ts.incrementToken()) {
            tokArray.pushString(termAttribute.term());
            Map<String, Integer> tagCandidates = tokArray.check(_resStores.get(lng).getCRCs(),
                    _lngStopTags.get(lng));
            if (tagCandidates.size() > 0) {
                for (Map.Entry<String, Integer> s : tagCandidates.entrySet()) {
                    String tag = _resStores.get(lng).getTag(s.getKey());
                    if (tag != null && tag.length() >= _MinWordLength) {
                        if (items.containsKey(tag)) {
                            items.put(tag, items.get(tag) + s.getValue());
                        } else {
                            items.put(tag, s.getValue());
                        }
                    }
                }
            }
        }
        ts.end();
        ts.close();
    } catch (IOException e) {
        logger.error(e);
    }
    return items;
}
From source file:com.jamespot.glifpix.library.TagsExtractorImpl.java
License:Open Source License
public Map<String, Float> getWeightedTagsFreq(String content, String lng) {
    Map<String, Float> items = new HashMap<String, Float>();
    TokensArray tokArray = new TokensArray(_MaxExpressionLength);
    TokenStream ts = _contentAnalyzer.tokenStream("dummyField", new StringReader(content));
    TermAttribute termAttribute = ts.addAttribute(TermAttribute.class);
    try {
        while (ts.incrementToken()) {
            tokArray.pushString(termAttribute.term());
            Map<String, Integer> tagCandidates = tokArray.check(_resStores.get(lng).getCRCs(),
                    _lngStopTags.get(lng));
            if (tagCandidates.size() > 0) {
                for (Map.Entry<String, Integer> s : tagCandidates.entrySet()) {
                    String tag = _resStores.get(lng).getTag(s.getKey());
                    if (tag != null && tag.length() >= _MinWordLength) {
                        if (items.containsKey(tag)) {
                            items.put(tag, items.get(tag)
                                    + (s.getValue().floatValue()) * getTagWeight(s.getKey(), lng));
                        } else {
                            items.put(tag, (s.getValue().floatValue()) * getTagWeight(s.getKey(), lng));
                        }
                    }
                }
            }
        }
        ts.end();
        ts.close();
    } catch (IOException e) {
        logger.error(e);
    }
    return items;
}
From source file:com.jamespot.glifpix.library.TagsExtractorImpl.java
License:Open Source License
public Set<String> getTokens(String content, String lng) {
    Set<String> tokens = new HashSet<String>();
    TokensArray tokArray = new TokensArray(15);
    TokenStream ts = _contentAnalyzer.tokenStream("dummyField", new StringReader(content));
    TermAttribute termAttribute = ts.addAttribute(TermAttribute.class);
    try {
        while (ts.incrementToken()) {
            tokArray.pushString(termAttribute.term());
            Map<String, Integer> tagCandidates = tokArray.check(_resStores.get(lng).getCRCs(),
                    _lngStopTags.get(lng));
            if (tagCandidates.size() > 0) {
                for (Map.Entry<String, Integer> s : tagCandidates.entrySet()) {
                    tokens.add(s.getKey());
                }
            }
        }
        ts.end();
        ts.close();
    } catch (IOException e) {
        logger.error(e);
    }
    return tokens;
}
From source file:com.lorelib.analyzer.sample.IKAnalzyerDemo.java
License:Apache License
public static void main(String[] args) {
    // Build the IK analyzer; the 'true' argument enables smart segmentation mode
    Analyzer analyzer = new IKAnalyzer(true);
    // Obtain a Lucene TokenStream from the analyzer
    TokenStream ts = null;
    try {
        ts = analyzer.tokenStream("myfield", new StringReader(
                "?????IKAnalyer can analysis english text too"));
        // get the offset attribute
        OffsetAttribute offset = ts.addAttribute(OffsetAttribute.class);
        // get the term text attribute
        CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
        // get the token type attribute
        TypeAttribute type = ts.addAttribute(TypeAttribute.class);
        // reset the TokenStream (re-binds the StringReader)
        ts.reset();
        // iterate over the tokens
        while (ts.incrementToken()) {
            System.out.println(offset.startOffset() + " - " + offset.endOffset() + " : " + term.toString()
                    + " | " + type.type());
        }
        // end the TokenStream (and the underlying StringReader)
        ts.end(); // Perform end-of-stream operations, e.g. set the final offset.
    } catch (IOException e) {
        e.printStackTrace();
    } finally {
        // release the TokenStream's resources
        if (ts != null) {
            try {
                ts.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }
}
From source file:com.ml.hadoop.nlp.SequenceFileTokenizerMapper.java
License:Apache License
@Override
protected void map(Text key, Text value, Context context) throws IOException, InterruptedException {
    TokenStream stream = analyzer.tokenStream(key.toString(), new StringReader(value.toString()));
    CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
    stream.reset();
    StringTuple document = new StringTuple();
    while (stream.incrementToken()) {
        if (termAtt.length() > 0) {
            document.add(new String(termAtt.buffer(), 0, termAtt.length()));
        }
    }
    stream.end();
    Closeables.close(stream, true);
    // drop stop words
    document = StopWordsHandler.dropStopWords(document);
    context.write(key, document);
}
From source file:com.nec.scg.senseRanking.CountTextSimilarity.java
public Map<String, Float> CountTF_IDF(String str, Analyzer a) {
    Map<String, Float> termVector = new TreeMap<String, Float>();
    try {
        TokenStream stream = a.tokenStream("content", new StringReader(str));
        PorterStemFilter filter = new PorterStemFilter(stream);
        CharTermAttribute cta = filter.addAttribute(CharTermAttribute.class);
        filter.reset();
        String strcat = null;
        int wordCount = 0;
        while (filter.incrementToken()) {
            strcat = cta.toString();
            // System.out.print("["+strcat+"]");
            if (!termVector.containsKey(strcat)) {
                termVector.put(strcat, 1f);
                wordCount++;
            } else {
                termVector.put(strcat, termVector.get(strcat) + 1);
                wordCount++;
            }
        }
        for (String ter : termVector.keySet()) {
            int hits = searchIndexforIDF(ter) + 1;
            float idf = (float) (Math.log(AllArticle * 1.0 / hits) + 1.0);
            float tf = termVector.get(ter) / wordCount;
            termVector.put(ter, tf * idf);
        }
        filter.end();
        stream.end();
        filter.close();
        stream.close();
    } catch (IOException e) {
        e.printStackTrace();
    }
    return termVector;
}
From source file:com.ProcessText.java
public static void main(String[] args) throws FileNotFoundException, IOException {
    HashMap<String, Integer> unigram = new HashMap<>();
    HashMap<String, Integer> bigram = new HashMap<>();
    HashMap<String, Integer> trigram = new HashMap<>();
    BufferedReader br = new BufferedReader(new FileReader("D:/phrases90"));
    String line;
    Analyzer biAnalyzer = new NGramTokenBaseAnalyzer(2, 2, false);
    Analyzer triAnalyzer = new NGramTokenBaseAnalyzer(3, 3, false);
    while ((line = br.readLine()) != null) {
        line = line.replaceAll("\\s+", " ").trim();
        // Remove some emoticons and symbols
        line = line.replaceAll("<3", "");
        line = line.replaceAll(":3", "");
        line = line.replaceAll(":v", "");
        line = line.replaceAll(":d", "");
        line = line.replaceAll(":D", "");
        line = line.replaceAll("p/s:", "");
        line = line.replaceAll(":\\)", "");
        // unigram process
        String[] arr = line.split("\\s");
        for (String item : arr) {
            item = item.replaceAll("\\s", "");
            if (item.length() > 0) {
                item = item.toLowerCase();
                Integer freq = unigram.get(item);
                if (freq != null) {
                    unigram.put(item, freq + 1);
                } else
                    unigram.put(item, 1);
            }
        }
        // bigram process
        if (line.length() > 0) {
            TokenStream stream = biAnalyzer.tokenStream(null, new StringReader(line));
            CharTermAttribute cattr = stream.addAttribute(CharTermAttribute.class);
            stream.reset();
            while (stream.incrementToken()) {
                String item = cattr.toString();
                //item = item.replaceAll("$[\\s]","");
                Integer count = bigram.get(item);
                int fcount = 0;
                if (count == null)
                    fcount = 1;
                else
                    fcount = count + 1;
                if (item.length() > 3)
                    bigram.put(item, fcount);
            }
            stream.end();
            stream.close();
            // trigram process
            TokenStream stream1 = triAnalyzer.tokenStream(null, new StringReader(line));
            CharTermAttribute cattr1 = stream1.addAttribute(CharTermAttribute.class);
            stream1.reset();
            while (stream1.incrementToken()) {
                String item = cattr1.toString();
                //item = item.replaceAll("$[\\s]","");
                Integer count = trigram.get(item);
                int fcount = 0;
                if (count == null)
                    fcount = 1;
                else
                    fcount = count + 1;
                if (item.length() > 5)
                    trigram.put(item, fcount);
            }
            stream1.end();
            stream1.close();
        }
    }
    // Compute probabilities for the unigrams
    HashMap<String, Double> unigramProb = new HashMap<>();
    int totalUniFreq = 0;
    int uniSize = unigram.size();
    for (String item : unigram.keySet()) {
        item = item.toLowerCase();
        int freq = unigram.get(item);
        totalUniFreq += freq;
    }
    // The probability formula below has been corrected (add-one smoothing)
    for (String item : unigram.keySet()) {
        item = item.toLowerCase();
        int freq = unigram.get(item);
        double prob = ((double) freq + 1) / (totalUniFreq + uniSize);
        // unigram.size() is the number of distinct unigram vocabulary entries
        unigramProb.put(item, prob);
    }
    System.out.println("Tong tan suat cua unigram = " + totalUniFreq);
    // Compute probabilities for the bigrams
    HashMap<String, Double> bigramProb = new HashMap<>();
    // Stores the total frequency of bigrams A* that start with unigram A,
    // needed for the conditional probability formula
    HashMap<String, Integer> startUnigramOfBigram = new HashMap<>();
    int totalBiFreq = 0; // total frequency of all bigrams A* of unigram A
    // Store A*
    for (String item : bigram.keySet()) {
        item = item.toLowerCase();
        int freq = bigram.get(item);
        totalBiFreq += freq;
        String[] arr = item.split("\\s");
        String key = arr[0].toLowerCase(); // not really necessary
        Integer startFreq = startUnigramOfBigram.get(key);
        if (startFreq == null)
            startUnigramOfBigram.put(key, freq);
        else
            startUnigramOfBigram.put(key, freq + startFreq);
    }
    // Apply the conditional probability formula (corrected)
    for (String item : bigram.keySet()) {
        int freq = bigram.get(item);
        String[] arr = item.split("\\s");
        String key = arr[0].toLowerCase();
        int startUniFreq = startUnigramOfBigram.get(key);
        double startUniProb;
        try {
            startUniProb = unigramProb.get(key);
        } catch (NullPointerException ex) {
            startUniProb = 1d / (1 + uniSize);
        }
        double prob = (((double) freq + 1) / (startUniFreq + uniSize)) * startUniProb;
        // uniSize = V is the size of the unigram vocabulary
        bigramProb.put(item, prob);
    }
    System.out.println("Tong tan suat cua bigram = " + totalBiFreq);
    // Compute probabilities for the trigrams
    HashMap<String, Double> trigramProb = new HashMap<>();
    // Stores the total frequency of trigrams AB* that start with bigram AB
    HashMap<String, Integer> startBigramOfTrigram = new HashMap<>();
    int totalTriFreq = 0;
    for (String item : trigram.keySet()) {
        int freq = trigram.get(item);
        totalTriFreq += freq;
        String[] arr = item.split("\\s");
        String key = arr[0] + " " + arr[1];
        Integer startFreq = startBigramOfTrigram.get(key);
        if (startFreq == null)
            startBigramOfTrigram.put(key, freq);
        else
            startBigramOfTrigram.put(key, freq + startFreq);
    }
    // Apply the conditional probability formula
    for (String item : trigram.keySet()) {
        double startBiProb;
        int freq = trigram.get(item);
        String[] arr = item.split("\\s");
        String key = arr[0] + " " + arr[1];
        //try {
        int startBiFreq = startBigramOfTrigram.get(key);
        try {
            startBiProb = bigramProb.get(key);
        } catch (NullPointerException ex) {
            startBiProb = 1d / (878592 + uniSize);
        }
        double prob = (((double) freq + 1) / (startBiFreq + uniSize)) * startBiProb;
        trigramProb.put(item, prob);
        //} catch(NullPointerException ex) {
        //}
    }
    System.out.println("Tong tan suat cua trigram = " + totalTriFreq);
    // Write the results out to files
    PrintWriter f0 = new PrintWriter(new FileWriter("D:/App/unigramProb.txt"));
    PrintWriter f1 = new PrintWriter(new FileWriter("D:/App/bigramProb.txt"));
    PrintWriter f2 = new PrintWriter(new FileWriter("D:/App/trigramProb.txt"));
    for (String item : unigramProb.keySet()) {
        double freq = unigramProb.get(item);
        f0.append(item + " = " + freq + "\n");
    }
    f0.close();
    for (String item : bigramProb.keySet()) {
        double freq = bigramProb.get(item);
        f1.append(item + " = " + freq + "\n");
    }
    f1.close();
    for (String item : trigramProb.keySet()) {
        double freq = trigramProb.get(item);
        f2.append(item + " = " + freq + "\n");
    }
    f2.close();
    PrintWriter f3 = new PrintWriter(new FileWriter("D:/App/stringProb.txt"));
    br = new BufferedReader(new FileReader("D:/phrases10"));
    HashMap<String, Integer> prefix3Gram = new HashMap<>();
    HashMap<String, Integer> phrases = new HashMap<>();
    while ((line = br.readLine()) != null) {
        line = line.replaceAll("\\s+", " ").trim();
        // Remove some emoticons and symbols
        line = line.replaceAll("<3", "");
        line = line.replaceAll(":3", "");
        line = line.replaceAll(":v", "");
        line = line.replaceAll(":d", "");
        line = line.replaceAll(":D", "");
        line = line.replaceAll("p/s:", "");
        line = line.replaceAll(":\\)", "");
        String[] arr = line.split("\\s");
        if (arr.length > 2) {
            String prefix = arr[0] + " " + arr[1] + " " + arr[2];
            Integer prefixFreq = prefix3Gram.get(prefix);
            if (prefixFreq == null)
                prefix3Gram.put(prefix, 1);
            else
                prefix3Gram.put(prefix, 1 + prefixFreq);
        }
        Integer freq = phrases.get(line);
        if (freq == null)
            phrases.put(line, 1);
        else
            phrases.put(line, freq + 1);
    }
    //br = new BufferedReader(new FileReader("D:/phrases10"));
    double totalProb = 0;
    int countItem = 0;
    for (String item : phrases.keySet()) {
        line = item;
        Integer lineFreq = phrases.get(item);
        if (lineFreq == null)
            lineFreq = 1;
        String[] arr = line.split("\\s");
        String prefix = line;
        double probOfLine = 1d * lineFreq / (uniSize + totalTriFreq / uniSize);
        int length = arr.length;
        if (length >= 3) {
            prefix = arr[0] + " " + arr[1] + " " + arr[2];
            int prefixTotal = prefix3Gram.get(prefix);
            try {
                double prefixProb = trigramProb.get(prefix);
                probOfLine = prefixProb;
                if (length > 3) {
                    for (int i = 3; i < length; i++) {
                        prefix = arr[i - 2] + " " + arr[i - 1] + " " + arr[i];
                        prefixTotal = prefix3Gram.get(prefix);
                        prefixProb = trigramProb.get(prefix);
                        probOfLine *= (1d / prefixTotal) * prefixProb;
                    }
                }
                //f3.append(line + " = " + probOfLine + "\n");
            } catch (NullPointerException ex) {
                probOfLine = 1d * lineFreq / (prefixTotal + uniSize);
            }
        }
        f3.append(line + " = " + probOfLine + "\n");
        countItem += arr.length;
        totalProb -= (Math.log(probOfLine) / Math.log(2));
    }
    double somu = totalProb / countItem;
    double perplexity = Math.pow(2, somu);
    f3.close();
    DecimalFormat df = new DecimalFormat("#");
    df.setMaximumFractionDigits(4);
    System.out.println(somu);
    System.out.printf("PERPLEXITY = " + df.format(perplexity));
}