List of usage examples for org.apache.lucene.analysis TokenStream reset
public void reset() throws IOException
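All of the examples below follow the same contract: obtain a TokenStream from an Analyzer, call reset() before the first incrementToken(), then call end() and close() when consumption is finished. A minimal sketch of that pattern (the analyzer, field name, and sample text are placeholders; StandardAnalyzer is only an assumption, any Analyzer behaves the same way):

import java.io.StringReader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

Analyzer analyzer = new StandardAnalyzer();
try (TokenStream ts = analyzer.tokenStream("field", new StringReader("some sample text"))) {
    CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
    ts.reset();                          // must be called before the first incrementToken()
    while (ts.incrementToken()) {
        System.out.println(term.toString());
    }
    ts.end();                            // records end-of-stream state such as the final offset
}                                        // try-with-resources closes the stream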
From source file:com.lorelib.analyzer.sample.IKAnalzyerDemo.java
License:Apache License
public static void main(String[] args) {
    // Build the IK analyzer in smart segmentation mode
    Analyzer analyzer = new IKAnalyzer(true);
    // Obtain a Lucene TokenStream
    TokenStream ts = null;
    try {
        ts = analyzer.tokenStream("myfield",
                new StringReader("?????IKAnalyer can analysis english text too"));
        // Offset attribute
        OffsetAttribute offset = ts.addAttribute(OffsetAttribute.class);
        // Term text attribute
        CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
        // Token type attribute
        TypeAttribute type = ts.addAttribute(TypeAttribute.class);
        // Reset the TokenStream (resets the underlying StringReader)
        ts.reset();
        // Iterate over the tokens
        while (ts.incrementToken()) {
            System.out.println(offset.startOffset() + " - " + offset.endOffset() + " : "
                    + term.toString() + " | " + type.type());
        }
        ts.end(); // Perform end-of-stream operations, e.g. set the final offset.
    } catch (IOException e) {
        e.printStackTrace();
    } finally {
        // Release the TokenStream and its resources
        if (ts != null) {
            try {
                ts.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }
}
From source file:com.mathworks.xzheng.analysis.AnalyzerUtils.java
License:Apache License
public static void displayTokens(TokenStream stream) throws IOException {
    OffsetAttribute offsetAttribute = stream.addAttribute(OffsetAttribute.class);
    CharTermAttribute charTermAttribute = stream.addAttribute(CharTermAttribute.class);
    stream.reset();
    while (stream.incrementToken()) {
        int startOffset = offsetAttribute.startOffset();
        int endOffset = offsetAttribute.endOffset();
        System.out.print("[" + charTermAttribute.toString() + "] ");
    }
}
From source file:com.memonews.mahout.sentiment.SentimentModelHelper.java
License:Apache License
private static void countWords(final Analyzer analyzer, final Collection<String> words, final Reader in,
        final Multiset<String> overallCounts) throws IOException {
    final TokenStream ts = analyzer.reusableTokenStream("text", in);
    ts.addAttribute(CharTermAttribute.class);
    ts.reset();
    while (ts.incrementToken()) {
        final String s = ts.getAttribute(CharTermAttribute.class).toString();
        words.add(s);
    }
    overallCounts.addAll(words);
}
From source file:com.ml.hadoop.nlp.SequenceFileTokenizerMapper.java
License:Apache License
@Override
protected void map(Text key, Text value, Context context) throws IOException, InterruptedException {
    TokenStream stream = analyzer.tokenStream(key.toString(), new StringReader(value.toString()));
    CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
    stream.reset(); // reset once, after all attributes have been added
    StringTuple document = new StringTuple();
    while (stream.incrementToken()) {
        if (termAtt.length() > 0) {
            document.add(new String(termAtt.buffer(), 0, termAtt.length()));
        }
    }
    stream.end();
    Closeables.close(stream, true);
    // drop stop words
    document = StopWordsHandler.dropStopWords(document);
    context.write(key, document);
}
From source file:com.o19s.solr.swan.highlight.SwanHighlighter.java
License:Apache License
private void doHighlightingByHighlighter(Query query, SolrQueryRequest req, NamedList docSummaries, int docId,
        Document doc, String fieldName) throws IOException {
    final SolrIndexSearcher searcher = req.getSearcher();
    final IndexSchema schema = searcher.getSchema();

    // TODO: Currently in trunk highlighting numeric fields is broken (Lucene) -
    // so we disable them until fixed (see LUCENE-3080)!
    // BEGIN: Hack
    final SchemaField schemaField = schema.getFieldOrNull(fieldName);
    if (schemaField != null && ((schemaField.getType() instanceof org.apache.solr.schema.TrieField)
            || (schemaField.getType() instanceof org.apache.solr.schema.TrieDateField)))
        return;
    // END: Hack

    SolrParams params = req.getParams();
    IndexableField[] docFields = doc.getFields(fieldName);
    List<String> listFields = new ArrayList<String>();
    for (IndexableField field : docFields) {
        listFields.add(field.stringValue());
    }

    String[] docTexts = listFields.toArray(new String[listFields.size()]);

    // according to Document javadoc, doc.getValues() never returns null. check empty instead of null
    if (docTexts.length == 0)
        return;

    TokenStream tokenStream;
    int numFragments = getMaxSnippets(fieldName, params);
    boolean mergeContiguousFragments = isMergeContiguousFragments(fieldName, params);

    List<TextFragment> frags = new ArrayList<TextFragment>();

    TermOffsetsTokenStream tots = null; // to be non-null iff we're using TermOffsets optimization
    try {
        // TokenStream tvStream = TokenSources.getTokenStream(searcher.getIndexReader(), docId, fieldName);
        // if (tvStream != null) {
        //     tots = new TermOffsetsTokenStream(tvStream);
        // }
    } catch (IllegalArgumentException e) {
        // No problem. But we can't use TermOffsets optimization.
    }

    for (int j = 0; j < docTexts.length; j++) {
        if (tots != null) {
            // if we're using TermOffsets optimization, then get the next
            // field value's TokenStream (i.e. get field j's TokenStream) from tots:
            tokenStream = tots.getMultiValuedTokenStream(docTexts[j].length());
        } else {
            // fall back to analyzer
            tokenStream = createAnalyzerTStream(schema, fieldName, docTexts[j]);
        }

        int maxCharsToAnalyze = params.getFieldInt(fieldName, HighlightParams.MAX_CHARS,
                Highlighter.DEFAULT_MAX_CHARS_TO_ANALYZE);

        Highlighter highlighter;
        if (Boolean.valueOf(req.getParams().get(HighlightParams.USE_PHRASE_HIGHLIGHTER, "true"))) {
            if (maxCharsToAnalyze < 0) {
                tokenStream = new CachingTokenFilter(tokenStream);
            } else {
                tokenStream = new CachingTokenFilter(
                        new OffsetLimitTokenFilter(tokenStream, maxCharsToAnalyze));
            }

            // get highlighter
            highlighter = getPhraseHighlighter(query, fieldName, req, (CachingTokenFilter) tokenStream);

            // after highlighter initialization, reset tstream since construction of highlighter already used it
            tokenStream.reset();
        } else {
            // use "the old way"
            highlighter = getHighlighter(query, fieldName, req);
        }

        if (maxCharsToAnalyze < 0) {
            highlighter.setMaxDocCharsToAnalyze(docTexts[j].length());
        } else {
            highlighter.setMaxDocCharsToAnalyze(maxCharsToAnalyze);
        }

        try {
            TextFragment[] bestTextFragments = highlighter.getBestTextFragments(tokenStream, docTexts[j],
                    mergeContiguousFragments, numFragments);
            for (int k = 0; k < bestTextFragments.length; k++) {
                if ((bestTextFragments[k] != null) && (bestTextFragments[k].getScore() > 0)) {
                    frags.add(bestTextFragments[k]);
                }
            }
        } catch (InvalidTokenOffsetsException e) {
            throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, e);
        }
    }

    // sort such that the fragments with the highest score come first
    Collections.sort(frags, new Comparator<TextFragment>() {
        public int compare(TextFragment arg0, TextFragment arg1) {
            return Math.round(arg1.getScore() - arg0.getScore());
        }
    });

    // convert fragments back into text
    // TODO: we can include score and position information in output as snippet attributes
    String[] summaries = null;
    if (frags.size() > 0) {
        ArrayList<String> fragTexts = new ArrayList<String>();
        for (TextFragment fragment : frags) {
            if ((fragment != null) && (fragment.getScore() > 0)) {
                fragTexts.add(fragment.toString());
            }
            if (fragTexts.size() >= numFragments)
                break;
        }
        // toArray() with no argument returns Object[]; pass a typed array to avoid a ClassCastException
        summaries = fragTexts.toArray(new String[fragTexts.size()]);
        if (summaries.length > 0)
            docSummaries.add(fieldName, summaries);
    }

    // no summaries made, copy text from alternate field
    if (summaries == null || summaries.length == 0) {
        alternateField(docSummaries, params, doc, fieldName);
    }
}
From source file:com.o19s.solr.swan.highlight.SwanHighlighter.java
License:Apache License
private TokenStream createAnalyzerTStream(IndexSchema schema, String fieldName, String docText)
        throws IOException {
    TokenStream tstream;
    TokenStream ts = schema.getAnalyzer().tokenStream(fieldName, new StringReader(docText));
    ts.reset();
    tstream = new TokenOrderingFilter(ts, 10);
    return tstream;
}
From source file:com.plug.Version_8_5_2.gs.ling.tm2.lucene.LuceneUtil.java
License:Apache License
/**
 * Create GlobalSight TM tokens from a provided segment string using GsAnalyzer.
 *
 * @param p_text
 *            fuzzy match format string
 * @return List of c.g.l.tm2.index.Tokens
 */
public static List<Token> createGsTokens(String p_text, GlobalSightLocale p_locale) throws Exception {
    GsAnalyzer analyzer = new GsAnalyzer(p_locale);
    TokenStream tokenStream = analyzer.tokenStream("blah", new StringReader(p_text));
    tokenStream.reset();

    // GSAttribute gsAtt = tokenStream.addAttribute(GSAttribute.class);
    // org.apache.lucene.analysis.Token luceneToken = null;
    List<String> tokens = new ArrayList<String>();

    while (tokenStream.incrementToken()) {
        // luceneToken = gsAtt.getToken();
        CharTermAttribute termAtt = tokenStream.getAttribute(CharTermAttribute.class);
        tokens.add(termAtt.toString());
    }
    tokenStream.close();

    return buildTokenList(tokens);
}
From source file:com.ProcessText.java
public static void main(String[] args) throws FileNotFoundException, IOException {
    HashMap<String, Integer> unigram = new HashMap<>();
    HashMap<String, Integer> bigram = new HashMap<>();
    HashMap<String, Integer> trigram = new HashMap<>();
    BufferedReader br = new BufferedReader(new FileReader("D:/phrases90"));
    String line;
    Analyzer biAnalyzer = new NGramTokenBaseAnalyzer(2, 2, false);
    Analyzer triAnalyzer = new NGramTokenBaseAnalyzer(3, 3, false);
    while ((line = br.readLine()) != null) {
        line = line.replaceAll("\\s+", " ").trim();
        // Strip some emoticons and markers
        line = line.replaceAll("<3", "");
        line = line.replaceAll(":3", "");
        line = line.replaceAll(":v", "");
        line = line.replaceAll(":d", "");
        line = line.replaceAll(":D", "");
        line = line.replaceAll("p/s:", "");
        line = line.replaceAll(":\\)", "");
        // unigram process
        String[] arr = line.split("\\s");
        for (String item : arr) {
            item = item.replaceAll("\\s", "");
            if (item.length() > 0) {
                item = item.toLowerCase();
                Integer freq = unigram.get(item);
                if (freq != null) {
                    unigram.put(item, freq + 1);
                } else
                    unigram.put(item, 1);
            }
        }
        // bigram process
        if (line.length() > 0) {
            TokenStream stream = biAnalyzer.tokenStream(null, new StringReader(line));
            CharTermAttribute cattr = stream.addAttribute(CharTermAttribute.class);
            stream.reset();
            while (stream.incrementToken()) {
                String item = cattr.toString();
                // item = item.replaceAll("$[\\s]", "");
                Integer count = bigram.get(item);
                int fcount = 0;
                if (count == null)
                    fcount = 1;
                else
                    fcount = count + 1;
                if (item.length() > 3)
                    bigram.put(item, fcount);
            }
            stream.end();
            stream.close();
            // trigram process
            TokenStream stream1 = triAnalyzer.tokenStream(null, new StringReader(line));
            CharTermAttribute cattr1 = stream1.addAttribute(CharTermAttribute.class);
            stream1.reset();
            while (stream1.incrementToken()) {
                String item = cattr1.toString();
                // item = item.replaceAll("$[\\s]", "");
                Integer count = trigram.get(item);
                int fcount = 0;
                if (count == null)
                    fcount = 1;
                else
                    fcount = count + 1;
                if (item.length() > 5)
                    trigram.put(item, fcount);
            }
            stream1.end();
            stream1.close();
        }
    }

    // Compute probabilities for the unigrams
    HashMap<String, Double> unigramProb = new HashMap<>();
    int totalUniFreq = 0;
    int uniSize = unigram.size();
    for (String item : unigram.keySet()) {
        item = item.toLowerCase();
        int freq = unigram.get(item);
        totalUniFreq += freq;
    }
    // The probability formula below has been corrected (add-one smoothing)
    for (String item : unigram.keySet()) {
        item = item.toLowerCase();
        int freq = unigram.get(item);
        double prob = ((double) freq + 1) / (totalUniFreq + uniSize);
        // unigram.size() is the number of distinct unigram vocabulary entries
        unigramProb.put(item, prob);
    }
    System.out.println("Tong tan suat cua unigram = " + totalUniFreq); // total unigram frequency

    // Compute probabilities for the bigrams
    HashMap<String, Double> bigramProb = new HashMap<>();
    // Total frequency of all bigrams A* that start with unigram A,
    // needed for the conditional probability formula
    HashMap<String, Integer> startUnigramOfBigram = new HashMap<>();
    int totalBiFreq = 0; // total frequency of all bigrams
    for (String item : bigram.keySet()) {
        item = item.toLowerCase();
        int freq = bigram.get(item);
        totalBiFreq += freq;
        String[] arr = item.split("\\s");
        String key = arr[0].toLowerCase(); // not strictly necessary
        Integer startFreq = startUnigramOfBigram.get(key);
        if (startFreq == null)
            startUnigramOfBigram.put(key, freq);
        else
            startUnigramOfBigram.put(key, freq + startFreq);
    }
    // Apply the conditional probability formula (corrected)
    for (String item : bigram.keySet()) {
        int freq = bigram.get(item);
        String[] arr = item.split("\\s");
        String key = arr[0].toLowerCase();
        int startUniFreq = startUnigramOfBigram.get(key);
        double startUniProb;
        try {
            startUniProb = unigramProb.get(key);
        } catch (NullPointerException ex) {
            startUniProb = 1d / (1 + uniSize);
        }
        // uniSize = V is the unigram vocabulary size
        double prob = (((double) freq + 1) / (startUniFreq + uniSize)) * startUniProb;
        bigramProb.put(item, prob);
    }
    System.out.println("Tong tan suat cua bigram = " + totalBiFreq); // total bigram frequency

    // Compute probabilities for the trigrams
    HashMap<String, Double> trigramProb = new HashMap<>();
    // Total frequency of all trigrams AB* that start with bigram AB
    HashMap<String, Integer> startBigramOfTrigram = new HashMap<>();
    int totalTriFreq = 0;
    for (String item : trigram.keySet()) {
        int freq = trigram.get(item);
        totalTriFreq += freq;
        String[] arr = item.split("\\s");
        String key = arr[0] + " " + arr[1];
        Integer startFreq = startBigramOfTrigram.get(key);
        if (startFreq == null)
            startBigramOfTrigram.put(key, freq);
        else
            startBigramOfTrigram.put(key, freq + startFreq);
    }
    // Apply the conditional probability formula
    for (String item : trigram.keySet()) {
        double startBiProb;
        int freq = trigram.get(item);
        String[] arr = item.split("\\s");
        String key = arr[0] + " " + arr[1];
        int startBiFreq = startBigramOfTrigram.get(key);
        try {
            startBiProb = bigramProb.get(key);
        } catch (NullPointerException ex) {
            startBiProb = 1d / (878592 + uniSize);
        }
        double prob = (((double) freq + 1) / (startBiFreq + uniSize)) * startBiProb;
        trigramProb.put(item, prob);
    }
    System.out.println("Tong tan suat cua trigram = " + totalTriFreq); // total trigram frequency

    // Write the probabilities out to files
    PrintWriter f0 = new PrintWriter(new FileWriter("D:/App/unigramProb.txt"));
    PrintWriter f1 = new PrintWriter(new FileWriter("D:/App/bigramProb.txt"));
    PrintWriter f2 = new PrintWriter(new FileWriter("D:/App/trigramProb.txt"));
    for (String item : unigramProb.keySet()) {
        double freq = unigramProb.get(item);
        f0.append(item + " = " + freq + "\n");
    }
    f0.close();
    for (String item : bigramProb.keySet()) {
        double freq = bigramProb.get(item);
        f1.append(item + " = " + freq + "\n");
    }
    f1.close();
    for (String item : trigramProb.keySet()) {
        double freq = trigramProb.get(item);
        f2.append(item + " = " + freq + "\n");
    }
    f2.close();

    PrintWriter f3 = new PrintWriter(new FileWriter("D:/App/stringProb.txt"));
    br = new BufferedReader(new FileReader("D:/phrases10"));
    HashMap<String, Integer> prefix3Gram = new HashMap<>();
    HashMap<String, Integer> phrases = new HashMap<>();
    while ((line = br.readLine()) != null) {
        line = line.replaceAll("\\s+", " ").trim();
        // Strip some emoticons and markers
        line = line.replaceAll("<3", "");
        line = line.replaceAll(":3", "");
        line = line.replaceAll(":v", "");
        line = line.replaceAll(":d", "");
        line = line.replaceAll(":D", "");
        line = line.replaceAll("p/s:", "");
        line = line.replaceAll(":\\)", "");
        String[] arr = line.split("\\s");
        if (arr.length > 2) {
            String prefix = arr[0] + " " + arr[1] + " " + arr[2];
            Integer prefixFreq = prefix3Gram.get(prefix);
            if (prefixFreq == null)
                prefix3Gram.put(prefix, 1);
            else
                prefix3Gram.put(prefix, 1 + prefixFreq);
        }
        Integer freq = phrases.get(line);
        if (freq == null)
            phrases.put(line, 1);
        else
            phrases.put(line, freq + 1);
    }
    // br = new BufferedReader(new FileReader("D:/phrases10"));

    double totalProb = 0;
    int countItem = 0;
    for (String item : phrases.keySet()) {
        line = item;
        Integer lineFreq = phrases.get(item);
        if (lineFreq == null)
            lineFreq = 1;
        String[] arr = line.split("\\s");
        String prefix = line;
        double probOfLine = 1d * lineFreq / (uniSize + totalTriFreq / uniSize);
        int length = arr.length;
        if (length >= 3) {
            prefix = arr[0] + " " + arr[1] + " " + arr[2];
            int prefixTotal = prefix3Gram.get(prefix);
            try {
                double prefixProb = trigramProb.get(prefix);
                probOfLine = prefixProb;
                if (length > 3) {
                    for (int i = 3; i < length; i++) {
                        prefix = arr[i - 2] + " " + arr[i - 1] + " " + arr[i];
                        prefixTotal = prefix3Gram.get(prefix);
                        prefixProb = trigramProb.get(prefix);
                        probOfLine *= (1d / prefixTotal) * prefixProb;
                    }
                }
                // f3.append(line + " = " + probOfLine + "\n");
            } catch (NullPointerException ex) {
                probOfLine = 1d * lineFreq / (prefixTotal + uniSize);
            }
        }
        f3.append(line + " = " + probOfLine + "\n");
        countItem += arr.length;
        totalProb -= (Math.log(probOfLine) / Math.log(2));
    }
    // somu = average negative log2 probability per token; perplexity = 2^somu
    double somu = totalProb / countItem;
    double perplexity = Math.pow(2, somu);
    f3.close();
    DecimalFormat df = new DecimalFormat("#");
    df.setMaximumFractionDigits(4);
    System.out.println(somu);
    System.out.printf("PERPLEXITY = " + df.format(perplexity));
}
From source file:com.qwazr.search.analysis.AnalyzerUtils.java
License:Apache License
final static public void forEachTerm(Analyzer analyzer, String field, String text, TermConsumer consumer)
        throws IOException {
    Objects.requireNonNull(analyzer, "The analyzer cannot be null");
    Objects.requireNonNull(field, "The field cannot be null");
    Objects.requireNonNull(text, "The text cannot be null");
    final TokenStream tokenStream = analyzer.tokenStream(field, text);
    try {
        final CharTermAttribute charTermAttr = getAttribute(tokenStream, CharTermAttribute.class);
        final FlagsAttribute flagsAttr = getAttribute(tokenStream, FlagsAttribute.class);
        final OffsetAttribute offsetAttr = getAttribute(tokenStream, OffsetAttribute.class);
        final PositionIncrementAttribute posIncAttr = getAttribute(tokenStream, PositionIncrementAttribute.class);
        final PositionLengthAttribute posLengthAttr = getAttribute(tokenStream, PositionLengthAttribute.class);
        final TypeAttribute typeAttr = getAttribute(tokenStream, TypeAttribute.class);
        final KeywordAttribute keywordAttr = getAttribute(tokenStream, KeywordAttribute.class);
        tokenStream.reset();
        while (tokenStream.incrementToken())
            if (!consumer.apply(charTermAttr, flagsAttr, offsetAttr, posIncAttr, posLengthAttr, typeAttr,
                    keywordAttr))
                break;
    } finally {
        tokenStream.close();
    }
}
From source file:com.qwazr.search.query.SpanPositionsQuery.java
License:Apache License
@Override
final public Query getQuery(QueryContext queryContext) throws IOException {
    BooleanQuery.Builder builder = new BooleanQuery.Builder();
    TokenStream tokenStream = queryContext.analyzer.tokenStream(field, queryContext.queryString);
    try {
        CharTermAttribute charTermAttribute = tokenStream.getAttribute(CharTermAttribute.class);
        PositionIncrementAttribute pocincrAttribute = tokenStream.getAttribute(PositionIncrementAttribute.class);
        tokenStream.reset();
        int pos = 0;
        while (tokenStream.incrementToken()) {
            final String charTerm = charTermAttribute.toString();
            int start = pos - distance;
            if (start < 0)
                start = 0;
            final int end = pos + distance + 1;
            for (int i = start; i < end; i++) {
                final float dist = Math.abs(i - pos) + 1;
                final float boost = 1 / dist;
                final SpanTermQuery spanTermQuery = new SpanTermQuery(new Term(field, charTerm));
                Query query = new BoostQuery(new SpanPositionRangeQuery(spanTermQuery, i, i + 1), boost);
                builder.add(new BooleanClause(query, BooleanClause.Occur.SHOULD));
            }
            pos += pocincrAttribute.getPositionIncrement();
        }
        return builder.build();
    } finally {
        IOUtils.closeQuietly(tokenStream);
    }
}