List of usage examples for org.apache.lucene.analysis TokenStream close
@Override public void close() throws IOException
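Every example below follows the same TokenStream lifecycle: obtain the stream, reset() it, loop over incrementToken(), call end(), and finally close(). A minimal self-contained sketch of that workflow, assuming a recent Lucene version where Analyzer and TokenStream implement Closeable and StandardAnalyzer is on the classpath (the field name "field" and the sample text are illustrative):

import java.io.IOException;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class CloseExample {
    public static void main(String[] args) throws IOException {
        // try-with-resources works because Analyzer and TokenStream implement
        // Closeable; close() releases resources even if reset()/incrementToken() throws
        try (Analyzer analyzer = new StandardAnalyzer();
                TokenStream stream = analyzer.tokenStream("field", "some text to tokenize")) {
            CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
            stream.reset();
            while (stream.incrementToken()) {
                System.out.println(termAtt.toString());
            }
            stream.end(); // records end-of-stream state (e.g. final offset)
        }
    }
}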
From source file:com.pongasoft.kiwidoc.index.impl.keyword.impl.KeywordIndexImpl.java
License:Apache License
/**
 * Generates a simple query: a boolean query made of TermQuery clauses joined by AND.
 *
 * @param keyword the raw keyword string to tokenize
 * @param field the field the terms are searched in
 * @return <code>null</code> if there are no terms
 * @throws ParseException
 */
private Query generateSimpleQuery(String keyword, String field) throws ParseException {
    int termCount = 0;
    TokenStream source = _analyzer.tokenStream(field, new StringReader(keyword));
    BooleanQuery q = new BooleanQuery();
    org.apache.lucene.analysis.Token t = new org.apache.lucene.analysis.Token();
    while (true) {
        try {
            t = source.next(t);
        } catch (IOException e) {
            if (log.isDebugEnabled()) log.debug("ignored exception", e);
            t = null;
        }
        if (t == null) break;
        termCount++;
        q.add(new TermQuery(new Term(field, t.term())), BooleanClause.Occur.MUST);
    }
    try {
        source.close();
    } catch (IOException e) {
        if (log.isDebugEnabled()) log.debug("ignored exception", e);
    }
    if (termCount == 0) {
        return null;
    }
    BooleanClause[] clauses = q.getClauses();
    if (clauses != null && clauses.length == 1) return clauses[0].getQuery();
    return q;
}
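Note that this example uses the pre-2.9 Token/next(Token) API, which later Lucene releases removed. For comparison, a hypothetical rewrite of the same method against the attribute-based API might look like the sketch below (BooleanQuery.Builder replaces the mutable BooleanQuery from Lucene 5 onwards; _analyzer, keyword, and field are as above):

// Hypothetical attribute-based equivalent; not part of the original source
private Query generateSimpleQuery(String keyword, String field) {
    BooleanQuery.Builder q = new BooleanQuery.Builder();
    int termCount = 0;
    try (TokenStream source = _analyzer.tokenStream(field, new StringReader(keyword))) {
        CharTermAttribute termAtt = source.addAttribute(CharTermAttribute.class);
        source.reset();
        while (source.incrementToken()) {
            termCount++;
            q.add(new TermQuery(new Term(field, termAtt.toString())), BooleanClause.Occur.MUST);
        }
        source.end();
    } catch (IOException e) {
        // tokenizing an in-memory string should not fail
    }
    if (termCount == 0) return null;
    BooleanQuery built = q.build();
    return built.clauses().size() == 1 ? built.clauses().get(0).getQuery() : built;
}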
From source file:com.ProcessText.java
public static void main(String[] args) throws FileNotFoundException, IOException {
    HashMap<String, Integer> unigram = new HashMap<>();
    HashMap<String, Integer> bigram = new HashMap<>();
    HashMap<String, Integer> trigram = new HashMap<>();
    BufferedReader br = new BufferedReader(new FileReader("D:/phrases90"));
    String line;
    Analyzer biAnalyzer = new NGramTokenBaseAnalyzer(2, 2, false);
    Analyzer triAnalyzer = new NGramTokenBaseAnalyzer(3, 3, false);
    while ((line = br.readLine()) != null) {
        line = line.replaceAll("\\s+", " ").trim();
        // Strip some emoticons and markers
        line = line.replaceAll("<3", "");
        line = line.replaceAll(":3", "");
        line = line.replaceAll(":v", "");
        line = line.replaceAll(":d", "");
        line = line.replaceAll(":D", "");
        line = line.replaceAll("p/s:", "");
        line = line.replaceAll(":\\)", "");
        // Count unigrams
        String[] arr = line.split("\\s");
        for (String item : arr) {
            item = item.replaceAll("\\s", "");
            if (item.length() > 0) {
                item = item.toLowerCase();
                Integer freq = unigram.get(item);
                if (freq != null) unigram.put(item, freq + 1);
                else unigram.put(item, 1);
            }
        }
        // Count bigrams
        if (line.length() > 0) {
            TokenStream stream = biAnalyzer.tokenStream(null, new StringReader(line));
            CharTermAttribute cattr = stream.addAttribute(CharTermAttribute.class);
            stream.reset();
            while (stream.incrementToken()) {
                String item = cattr.toString();
                Integer count = bigram.get(item);
                int fcount = (count == null) ? 1 : count + 1;
                if (item.length() > 3) bigram.put(item, fcount);
            }
            stream.end();
            stream.close();
            // Count trigrams
            TokenStream stream1 = triAnalyzer.tokenStream(null, new StringReader(line));
            CharTermAttribute cattr1 = stream1.addAttribute(CharTermAttribute.class);
            stream1.reset();
            while (stream1.incrementToken()) {
                String item = cattr1.toString();
                Integer count = trigram.get(item);
                int fcount = (count == null) ? 1 : count + 1;
                if (item.length() > 5) trigram.put(item, fcount);
            }
            stream1.end();
            stream1.close();
        }
    }

    // Compute probabilities for the unigrams
    HashMap<String, Double> unigramProb = new HashMap<>();
    int totalUniFreq = 0;
    int uniSize = unigram.size();
    for (String item : unigram.keySet()) {
        item = item.toLowerCase();
        int freq = unigram.get(item);
        totalUniFreq += freq;
    }
    // The probability formula below was corrected (add-one smoothing);
    // uniSize is the number of distinct unigram types (vocabulary size V)
    for (String item : unigram.keySet()) {
        item = item.toLowerCase();
        int freq = unigram.get(item);
        double prob = ((double) freq + 1) / (totalUniFreq + uniSize);
        unigramProb.put(item, prob);
    }
    System.out.println("Total unigram frequency = " + totalUniFreq);

    // Compute probabilities for the bigrams
    HashMap<String, Double> bigramProb = new HashMap<>();
    // Total frequency of all bigrams A* starting with unigram A,
    // needed for the conditional probability formula
    HashMap<String, Integer> startUnigramOfBigram = new HashMap<>();
    int totalBiFreq = 0; // total frequency over all bigrams
    for (String item : bigram.keySet()) {
        item = item.toLowerCase();
        int freq = bigram.get(item);
        totalBiFreq += freq;
        String[] arr = item.split("\\s");
        String key = arr[0].toLowerCase(); // toLowerCase() is not strictly needed here
        Integer startFreq = startUnigramOfBigram.get(key);
        if (startFreq == null) startUnigramOfBigram.put(key, freq);
        else startUnigramOfBigram.put(key, freq + startFreq);
    }
    // Apply the conditional probability formula (corrected)
    for (String item : bigram.keySet()) {
        int freq = bigram.get(item);
        String[] arr = item.split("\\s");
        String key = arr[0].toLowerCase();
        int startUniFreq = startUnigramOfBigram.get(key);
        double startUniProb;
        try {
            startUniProb = unigramProb.get(key);
        } catch (NullPointerException ex) {
            startUniProb = 1d / (1 + uniSize);
        }
        // uniSize = V is the size of the unigram vocabulary
        double prob = (((double) freq + 1) / (startUniFreq + uniSize)) * startUniProb;
        bigramProb.put(item, prob);
    }
    System.out.println("Total bigram frequency = " + totalBiFreq);

    // Compute probabilities for the trigrams
    HashMap<String, Double> trigramProb = new HashMap<>();
    // Total frequency of all trigrams AB* starting with bigram AB
    HashMap<String, Integer> startBigramOfTrigram = new HashMap<>();
    int totalTriFreq = 0;
    for (String item : trigram.keySet()) {
        int freq = trigram.get(item);
        totalTriFreq += freq;
        String[] arr = item.split("\\s");
        String key = arr[0] + " " + arr[1];
        Integer startFreq = startBigramOfTrigram.get(key);
        if (startFreq == null) startBigramOfTrigram.put(key, freq);
        else startBigramOfTrigram.put(key, freq + startFreq);
    }
    // Apply the conditional probability formula
    for (String item : trigram.keySet()) {
        double startBiProb;
        int freq = trigram.get(item);
        String[] arr = item.split("\\s");
        String key = arr[0] + " " + arr[1];
        int startBiFreq = startBigramOfTrigram.get(key);
        try {
            startBiProb = bigramProb.get(key);
        } catch (NullPointerException ex) {
            startBiProb = 1d / (878592 + uniSize);
        }
        double prob = (((double) freq + 1) / (startBiFreq + uniSize)) * startBiProb;
        trigramProb.put(item, prob);
    }
    System.out.println("Total trigram frequency = " + totalTriFreq);

    // Write the probabilities out to files
    PrintWriter f0 = new PrintWriter(new FileWriter("D:/App/unigramProb.txt"));
    PrintWriter f1 = new PrintWriter(new FileWriter("D:/App/bigramProb.txt"));
    PrintWriter f2 = new PrintWriter(new FileWriter("D:/App/trigramProb.txt"));
    for (String item : unigramProb.keySet()) {
        double freq = unigramProb.get(item);
        f0.append(item + " = " + freq + "\n");
    }
    f0.close();
    for (String item : bigramProb.keySet()) {
        double freq = bigramProb.get(item);
        f1.append(item + " = " + freq + "\n");
    }
    f1.close();
    for (String item : trigramProb.keySet()) {
        double freq = trigramProb.get(item);
        f2.append(item + " = " + freq + "\n");
    }
    f2.close();

    // Score a held-out file and compute perplexity
    PrintWriter f3 = new PrintWriter(new FileWriter("D:/App/stringProb.txt"));
    br = new BufferedReader(new FileReader("D:/phrases10"));
    HashMap<String, Integer> prefix3Gram = new HashMap<>();
    HashMap<String, Integer> phrases = new HashMap<>();
    while ((line = br.readLine()) != null) {
        line = line.replaceAll("\\s+", " ").trim();
        // Strip the same emoticons and markers
        line = line.replaceAll("<3", "");
        line = line.replaceAll(":3", "");
        line = line.replaceAll(":v", "");
        line = line.replaceAll(":d", "");
        line = line.replaceAll(":D", "");
        line = line.replaceAll("p/s:", "");
        line = line.replaceAll(":\\)", "");
        String[] arr = line.split("\\s");
        if (arr.length > 2) {
            String prefix = arr[0] + " " + arr[1] + " " + arr[2];
            Integer prefixFreq = prefix3Gram.get(prefix);
            if (prefixFreq == null) prefix3Gram.put(prefix, 1);
            else prefix3Gram.put(prefix, 1 + prefixFreq);
        }
        Integer freq = phrases.get(line);
        if (freq == null) phrases.put(line, 1);
        else phrases.put(line, freq + 1);
    }
    double totalProb = 0;
    int countItem = 0;
    for (String item : phrases.keySet()) {
        line = item;
        Integer lineFreq = phrases.get(item);
        if (lineFreq == null) lineFreq = 1;
        String[] arr = line.split("\\s");
        String prefix = line;
        double probOfLine = 1d * lineFreq / (uniSize + totalTriFreq / uniSize);
        int length = arr.length;
        if (length >= 3) {
            prefix = arr[0] + " " + arr[1] + " " + arr[2];
            int prefixTotal = prefix3Gram.get(prefix);
            try {
                double prefixProb = trigramProb.get(prefix);
                probOfLine = prefixProb;
                if (length > 3) {
                    // chain the trigram probabilities across the phrase
                    for (int i = 3; i < length; i++) {
                        prefix = arr[i - 2] + " " + arr[i - 1] + " " + arr[i];
                        prefixTotal = prefix3Gram.get(prefix);
                        prefixProb = trigramProb.get(prefix);
                        probOfLine *= (1d / prefixTotal) * prefixProb;
                    }
                }
            } catch (NullPointerException ex) {
                probOfLine = 1d * lineFreq / (prefixTotal + uniSize);
            }
        }
        f3.append(line + " = " + probOfLine + "\n");
        countItem += arr.length;
        totalProb -= (Math.log(probOfLine) / Math.log(2));
    }
    // somu is the average negative log2 probability per token; perplexity = 2^somu
    double somu = totalProb / countItem;
    double perplexity = Math.pow(2, somu);
    f3.close();
    DecimalFormat df = new DecimalFormat("#");
    df.setMaximumFractionDigits(4);
    System.out.println(somu);
    System.out.println("PERPLEXITY = " + df.format(perplexity));
}
From source file:com.qwazr.search.analysis.AnalyzerUtils.java
License:Apache License
final static public void forEachTerm(Analyzer analyzer, String field, String text, TermConsumer consumer)
        throws IOException {
    Objects.requireNonNull(analyzer, "The analyzer cannot be null");
    Objects.requireNonNull(field, "The field cannot be null");
    Objects.requireNonNull(text, "The text cannot be null");
    final TokenStream tokenStream = analyzer.tokenStream(field, text);
    try {
        final CharTermAttribute charTermAttr = getAttribute(tokenStream, CharTermAttribute.class);
        final FlagsAttribute flagsAttr = getAttribute(tokenStream, FlagsAttribute.class);
        final OffsetAttribute offsetAttr = getAttribute(tokenStream, OffsetAttribute.class);
        final PositionIncrementAttribute posIncAttr = getAttribute(tokenStream, PositionIncrementAttribute.class);
        final PositionLengthAttribute posLengthAttr = getAttribute(tokenStream, PositionLengthAttribute.class);
        final TypeAttribute typeAttr = getAttribute(tokenStream, TypeAttribute.class);
        final KeywordAttribute keywordAttr = getAttribute(tokenStream, KeywordAttribute.class);
        tokenStream.reset();
        while (tokenStream.incrementToken())
            if (!consumer.apply(charTermAttr, flagsAttr, offsetAttr, posIncAttr, posLengthAttr, typeAttr,
                    keywordAttr))
                break;
    } finally {
        tokenStream.close();
    }
}
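A hypothetical invocation, assuming TermConsumer is a functional interface whose apply(...) returns false to stop consuming (which the break in the loop above suggests); StandardAnalyzer and the field/text arguments are illustrative:

// Hypothetical usage; TermConsumer's exact declaration is not shown in the source
AnalyzerUtils.forEachTerm(new StandardAnalyzer(), "body", "The quick brown fox",
        (term, flags, offset, posInc, posLen, type, keyword) -> {
            // print each term with its character offsets
            System.out.println(term + " [" + offset.startOffset() + "-" + offset.endOffset() + "]");
            return true; // keep consuming tokens
        });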
From source file:com.romeikat.datamessie.core.base.util.ParseUtil.java
License:Open Source License
public List<String> parseTerms(final String text, final Analyzer analyzer) {
    final List<String> terms = new LinkedList<String>();
    try {
        final TokenStream tokenStream = analyzer.tokenStream(null, text);
        tokenStream.reset();
        final Attribute attribute = tokenStream.getAttribute(CharTermAttribute.class);
        while (tokenStream.incrementToken()) {
            final String term = attribute.toString();
            terms.add(term);
        }
        tokenStream.end();
        tokenStream.close();
    } catch (final IOException e) {
        // Cannot be thrown due to usage of a StringReader
    }
    return terms;
}
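A hypothetical call, assuming ParseUtil has an accessible constructor; with StandardAnalyzer the result would be lowercased with punctuation stripped:

List<String> terms = new ParseUtil().parseTerms("Hello, World!", new StandardAnalyzer());
// terms -> [hello, world]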
From source file:com.scaleunlimited.classify.analyzer.LuceneAnalyzer.java
License:Apache License
/**
 * @param contentText input text to be parsed into terms
 * @return salient terms in order of appearance
 *         (or null if this content should be ignored)
 */
public List<String> getTermList(String contentText) {
    init();
    List<String> result = new ArrayList<String>(contentText.length() / 10);
    try {
        TokenStream stream = _analyzer.tokenStream("content", new StringReader(contentText));
        CharTermAttribute termAtt = (CharTermAttribute) stream.addAttribute(CharTermAttribute.class);
        stream.reset();
        while (stream.incrementToken()) {
            if (termAtt.length() > 0) {
                String term = termAtt.toString();
                // Here we skip runs of position increment markers created
                // by the ShingleFilter for stop words because they skew
                // the clustering/liblinear analysis.
                if (!term.matches("(_ )*_")) {
                    result.add(term);
                }
            }
        }
        stream.end();
        stream.close();
    } catch (IOException e) {
        throw new RuntimeException("Impossible error", e);
    }
    return result;
}
From source file:com.searchcode.app.util.CodeAnalyzer.java
License:Open Source License
public static void main(String[] args) throws IOException {
    // text to tokenize
    final String text = "This is a demo of the TokenStream API";
    CodeAnalyzer analyzer = new CodeAnalyzer();
    TokenStream stream = analyzer.tokenStream("field", new StringReader(text));
    // get the CharTermAttribute from the TokenStream
    CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
    try {
        stream.reset();
        // print all tokens until the stream is exhausted
        while (stream.incrementToken()) {
            System.out.println(termAtt.toString());
        }
        stream.end();
    } finally {
        stream.close();
    }
}
From source file:com.shaie.annots.AnnotatingTokenStreamExample.java
License:Apache License
public static void main(String[] args) throws Exception {
    String text = "quick brown fox ate the blue red chicken";
    Tokenizer tokenizer = new WhitespaceTokenizer();
    tokenizer.setReader(new StringReader(text));
    TeeSinkTokenFilter teeSink = new TeeSinkTokenFilter(tokenizer);
    TokenStream colors = new AnnotatingTokenFilter(teeSink.newSinkTokenStream(new ColorsSinkFilter()),
            COLOR_ANNOT_TERM);

    System.out.println("Text tokens:\n");

    // Consume all the tokens from the original stream. This also populates
    // the sink (colors) with its color-matching tokens.
    teeSink.reset();
    CharTermAttribute termAtt = teeSink.getAttribute(CharTermAttribute.class);
    PositionIncrementAttribute termPosAtt = teeSink.getAttribute(PositionIncrementAttribute.class);
    int termsPos = -1;
    while (teeSink.incrementToken()) {
        termsPos += termPosAtt.getPositionIncrement();
        System.out.println("term=" + termAtt + ", pos=" + termsPos);
    }
    teeSink.end();
    tokenizer.end();

    System.out.println("\nAnnotation tokens:\n");

    // Now consume the color annotation tokens from the colors stream.
    CharTermAttribute colorAtt = colors.getAttribute(CharTermAttribute.class);
    PayloadAttribute payloadAtt = colors.getAttribute(PayloadAttribute.class);
    ByteArrayDataInput in = new ByteArrayDataInput();
    colors.reset();
    while (colors.incrementToken()) {
        BytesRef bytes = payloadAtt.getPayload();
        in.reset(bytes.bytes, bytes.offset, bytes.length);
        System.out.println("term=" + colorAtt + ", start=" + in.readVInt() + ", length=" + in.readVInt());
    }
    colors.end();
    colors.close();
    teeSink.close();
    tokenizer.close();
}
From source file:com.shaie.SynonymFilterExample.java
License:Apache License
@SuppressWarnings("resource")
public static void main(String[] args) throws Exception {
    final Tokenizer tok = new WhitespaceTokenizer();
    tok.setReader(new StringReader("dark sea green sea green"));

    final SynonymMap.Builder builder = new SynonymMap.Builder(true);
    addSynonym("dark sea green", "color", builder);
    addSynonym("green", "color", builder);
    addSynonym("dark sea", "color", builder);
    addSynonym("sea green", "color", builder);
    final SynonymMap synMap = builder.build();
    final TokenStream ts = new SynonymGraphFilter(tok, synMap, true);

    final CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
    final PositionIncrementAttribute posIncrAtt = ts.addAttribute(PositionIncrementAttribute.class);
    final PositionLengthAttribute posLengthAtt = ts.addAttribute(PositionLengthAttribute.class);

    ts.reset();
    int pos = -1;
    while (ts.incrementToken()) {
        pos += posIncrAtt.getPositionIncrement();
        System.out.println("term=" + termAtt + ", pos=" + pos + ", posLen=" + posLengthAtt.getPositionLength());
    }
    ts.end();
    ts.close();
}
From source file:com.sindicetech.siren.analysis.filter.TestURINormalisationFilter.java
License:Open Source License
public void assertNormalisesTo(final Tokenizer t, final String input, final String[] expectedImages,
        final String[] expectedTypes) throws Exception {
    assertTrue("has CharTermAttribute", t.hasAttribute(CharTermAttribute.class));
    final CharTermAttribute termAtt = t.getAttribute(CharTermAttribute.class);

    TypeAttribute typeAtt = null;
    if (expectedTypes != null) {
        assertTrue("has TypeAttribute", t.hasAttribute(TypeAttribute.class));
        typeAtt = t.getAttribute(TypeAttribute.class);
    }

    t.setReader(new StringReader(input));
    t.reset();

    final TokenStream filter = new URINormalisationFilter(t);
    for (int i = 0; i < expectedImages.length; i++) {
        assertTrue("token " + i + " exists", filter.incrementToken());
        assertEquals(expectedImages[i], termAtt.toString());
        if (expectedTypes != null) {
            assertEquals(expectedTypes[i], typeAtt.type());
        }
    }

    assertFalse("end of stream", filter.incrementToken());
    filter.end();
    filter.close();
}
From source file:com.sindicetech.siren.analysis.NodeAnalyzerTestCase.java
License:Open Source License
public void assertAnalyzesTo(final Analyzer a, final String input, final String[] expectedImages,
        final String[] expectedTypes, final int[] expectedPosIncrs, final IntsRef[] expectedNode,
        final int[] expectedPos) throws Exception {
    final TokenStream t = a.tokenStream("", new StringReader(input));
    t.reset();

    assertTrue("has TermAttribute", t.hasAttribute(CharTermAttribute.class));
    final CharTermAttribute termAtt = t.getAttribute(CharTermAttribute.class);

    TypeAttribute typeAtt = null;
    if (expectedTypes != null) {
        assertTrue("has TypeAttribute", t.hasAttribute(TypeAttribute.class));
        typeAtt = t.getAttribute(TypeAttribute.class);
    }

    PositionIncrementAttribute posIncrAtt = null;
    if (expectedPosIncrs != null) {
        assertTrue("has PositionIncrementAttribute", t.hasAttribute(PositionIncrementAttribute.class));
        posIncrAtt = t.getAttribute(PositionIncrementAttribute.class);
    }

    NodeAttribute nodeAtt = null;
    if (expectedNode != null) {
        assertTrue("has NodeAttribute", t.hasAttribute(NodeAttribute.class));
        nodeAtt = t.getAttribute(NodeAttribute.class);
    }

    PositionAttribute posAtt = null;
    if (expectedPos != null) {
        assertTrue("has PositionAttribute", t.hasAttribute(PositionAttribute.class));
        posAtt = t.getAttribute(PositionAttribute.class);
    }

    for (int i = 0; i < expectedImages.length; i++) {
        assertTrue("token " + i + " exists", t.incrementToken());
        assertEquals("i=" + i, expectedImages[i], termAtt.toString());
        if (expectedTypes != null) {
            assertEquals(expectedTypes[i], typeAtt.type());
        }
        if (expectedPosIncrs != null) {
            assertEquals(expectedPosIncrs[i], posIncrAtt.getPositionIncrement());
        }
        if (expectedNode != null) {
            assertEquals(expectedNode[i], nodeAtt.node());
        }
        if (expectedPos != null) {
            assertEquals(expectedPos[i], posAtt.position());
        }
    }

    assertFalse("end of stream, received token " + termAtt.toString(), t.incrementToken());
    t.end();
    t.close();
}