List of usage examples for org.apache.lucene.analysis TokenStream addAttribute
public final <T extends Attribute> T addAttribute(Class<T> attClass)
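Before the per-project examples, here is a minimal sketch of the typical addAttribute consume loop (assuming a recent Lucene 4.x+ analysis module; the StandardAnalyzer, the field name "body", and the sample text are placeholders, not taken from the examples below). addAttribute returns the stream's single shared attribute instance, which is refilled on every incrementToken() call, so it is obtained once before the loop.

import java.io.IOException;
import java.io.StringReader;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class AddAttributeSketch {
    public static void main(String[] args) throws IOException {
        Analyzer analyzer = new StandardAnalyzer();
        TokenStream ts = analyzer.tokenStream("body", new StringReader("Hello token stream world"));
        // addAttribute registers the attribute on the stream (or returns the
        // existing instance); its contents change on each incrementToken()
        CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
        try {
            ts.reset();                      // required before the first incrementToken() in Lucene 4+
            while (ts.incrementToken()) {
                System.out.println(termAtt.toString());
            }
            ts.end();
        } finally {
            ts.close();
        }
    }
}

Note that several of the older examples below omit reset()/end()/close(); they were written against earlier Lucene versions where that worked.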
From source file:com.cloudera.knittingboar.sgd.olr.TestBaseOLR_Train20Newsgroups.java
License:Apache License
/**
 * Counts words
 *
 * @param analyzer
 * @param words
 * @param in
 * @throws IOException
 */
private static void countWords(Analyzer analyzer, Collection<String> words, Reader in) throws IOException {
    //System.out.println( "> ----- countWords ------" );
    // use the provided analyzer to tokenize the input stream
    TokenStream ts = analyzer.tokenStream("text", in);
    ts.addAttribute(CharTermAttribute.class);

    // for each word in the stream, minus non-word stuff, add word to collection
    while (ts.incrementToken()) {
        String s = ts.getAttribute(CharTermAttribute.class).toString();
        //System.out.print( " " + s );
        words.add(s);
    }
    //System.out.println( "\n<" );
    /*overallCounts.addAll(words);*/
}
From source file:com.cloudera.knittingboar.utils.DatasetConverter.java
License:Apache License
private static void countWords(Analyzer analyzer, Collection<String> words, Reader in) throws IOException {
    // use the provided analyzer to tokenize the input stream
    TokenStream ts = analyzer.tokenStream("text", in);
    ts.addAttribute(CharTermAttribute.class);

    // for each word in the stream, minus non-word stuff, add word to collection
    while (ts.incrementToken()) {
        String s = ts.getAttribute(CharTermAttribute.class).toString();
        // System.out.print( " " + s );
        words.add(s);
    }
}
From source file:com.cloudera.knittingboar.utils.DatasetConverter.java
License:Apache License
public static String ReadFullFile(Analyzer analyzer, String newsgroup_name, String file) throws IOException {
    String out = newsgroup_name + "\t";
    BufferedReader reader = null;
    // Collection<String> words
    Multiset<String> words = ConcurrentHashMultiset.create();

    try {
        reader = new BufferedReader(new FileReader(file));
        TokenStream ts = analyzer.tokenStream("text", reader);
        ts.addAttribute(CharTermAttribute.class);

        // for each word in the stream, minus non-word stuff, add word to collection
        while (ts.incrementToken()) {
            String s = ts.getAttribute(CharTermAttribute.class).toString();
            out += s + " ";
        }
    } finally {
        if (reader != null) {
            reader.close();
        }
    }
    return out + "\n";
}
From source file:com.clustertest2.clustertest2.vectorization.DocTokenizer.java
public void performWork(Path doc) throws IOException {
    try {
        System.out.println("performing token work");
        HashMap<Text, StringTuple> tokenized = new HashMap<>();
        StringBuilder part = new StringBuilder();

        // store the tokens of each doc
        for (Pair<Writable, Writable> pair : new SequenceFileDirIterable<>(doc, PathType.GLOB,
                ClusterFileService.CONF)) {
            String key = pair.getFirst().toString();
            System.out.println(key);
            String value = pair.getSecond().toString();
            part.append(key);
            TokenStream stream = analyzer.tokenStream(key, new StringReader(value));
            CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
            stream.reset();
            StringTuple document = new StringTuple();
            while (stream.incrementToken()) {
                if (termAtt.length() > 0) {
                    document.add(new String(termAtt.buffer(), 0, termAtt.length()));
                }
            }
            stream.end();
            stream.close();
            tokenized.put(new Text(key), document);
        }

        // write the sequencefile
        Path tokenizedSeq = new Path(vectorsDir, part.toString());
        try (SequenceFile.Writer writer = new SequenceFile.Writer(ClusterFileService.FS, ClusterFileService.CONF,
                tokenizedSeq, Text.class, StringTuple.class)) {
            for (Text k : tokenized.keySet()) {
                writer.append(k, tokenized.get(k));
            }
            writer.close();
            System.out.println("wrote");
        }
    } catch (Exception e) {
        System.out.println(e.getMessage());
    } finally {
        numThreads.decrementAndGet();
    }
}
From source file:com.digitalpebble.behemoth.mahout.LuceneTokenizerMapper.java
License:Apache License
@Override
protected void map(Text key, BehemothDocument value, Context context) throws IOException, InterruptedException {
    String sContent = value.getText();
    if (sContent == null) {
        // no text available? skip
        context.getCounter("LuceneTokenizer", "BehemothDocWithoutText").increment(1);
        return;
    }
    TokenStream stream = analyzer.reusableTokenStream(key.toString(), new StringReader(sContent.toString()));
    CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
    StringTuple document = new StringTuple();
    stream.reset();
    while (stream.incrementToken()) {
        if (termAtt.length() > 0) {
            document.add(new String(termAtt.buffer(), 0, termAtt.length()));
        }
    }
    context.write(key, document);
}
From source file:com.digitalpebble.classification.example.TwentyNewsgroups.java
License:Apache License
private List<String> analyseField(String content) throws IOException {
    if (content == null)
        return null;
    List<String> tokens = new ArrayList<String>();
    StringReader sr = new StringReader(content);
    TokenStream ts = analyzer.tokenStream("dummyValue", sr);
    TermAttribute term = ts.addAttribute(TermAttribute.class);
    while (ts.incrementToken()) {
        tokens.add(term.term());
    }
    return tokens;
}
From source file:com.doculibre.constellio.lucene.BaseLuceneIndexHelper.java
License:Open Source License
public static String analyze(String str, Analyzer analyzer) throws IOException {
    if (analyzer == null) {
        return str;
    }
    StringBuilder norm = new StringBuilder();
    TokenStream tokens = analyzer.tokenStream("", new StringReader(str));
    tokens.reset();
    CharTermAttribute termAtt = tokens.addAttribute(CharTermAttribute.class);
    while (tokens.incrementToken()) {
        norm.append(termAtt.buffer(), 0, termAtt.length());
    }
    return norm.toString();
}
From source file:com.doculibre.constellio.utils.AnalyzerUtils.java
License:Open Source License
public static String analyzePhrase(String phrase, boolean useStopWords) {
    if (StringUtils.isNotBlank(phrase)) {
        String analysedPhrase;
        Analyzer analyzer = getDefaultAnalyzer(useStopWords);
        StringBuilder norm = new StringBuilder();
        TokenStream tokens;
        try {
            tokens = analyzer.tokenStream("", new StringReader(phrase));
            tokens.reset();
            CharTermAttribute termAtt = tokens.addAttribute(CharTermAttribute.class);
            while (tokens.incrementToken()) {
                norm.append(termAtt.buffer(), 0, termAtt.length());
            }
            analysedPhrase = norm.toString().trim();
        } catch (IOException e) {
            throw new RuntimeException(e);
        }
        return analysedPhrase;
    } else {
        return phrase;
    }
}
From source file:com.faqit.similarity.NGramExtractor.java
License:Open Source License
/**
 * Extracts NGrams from a String of text. Can handle ngrams of any length
 * and also perform stop word removal before extraction.
 *
 * @param text
 *            the text that the ngrams should be extracted from
 * @param length
 *            the length of the ngrams
 * @param stopWords
 *            whether or not stopwords should be removed before extraction
 * @param overlap
 *            whether or not the ngrams should overlap
 */
public void extract(String text, int length, Boolean stopWords, Boolean overlap)
        throws FileNotFoundException, IOException {
    this.text = text;
    this.length = length;
    this.stopWords = stopWords;
    this.overlap = overlap;

    nGrams = new LinkedList<String>();
    uniqueNGrams = new LinkedList<String>();
    nGramFreqs = new HashMap<String, Integer>();

    /*
     * If the minLength and maxLength are both 1, then we want unigrams.
     * Make use of a StopAnalyzer when stopwords should be removed.
     * Make use of a SimpleAnalyzer when stop words should be included.
     */
    if (length == 1) {
        if (this.stopWords) {
            analyzer = new StandardAnalyzer();
        } else {
            analyzer = new SimpleAnalyzer();
        }
    } else {
        // Bigger than unigrams so use ShingleAnalyzerWrapper. Once again,
        // different analyzers depending on stop word removal.
        if (this.stopWords) {
            // This is a hack to use Lucene 2.4, since in 2.4 position increments
            // weren't preserved by default. Using a later version puts underscores (_)
            // in the place of removed stop words.
            analyzer = new ShingleAnalyzerWrapper(new StopAnalyzer(), length, length, " ", false, false, "");
        } else {
            analyzer = new ShingleAnalyzerWrapper(new SimpleAnalyzer(), length, length, " ", false, false, "");
        }
    }

    // Code to process and extract the ngrams
    TokenStream tokenStream = analyzer.tokenStream("text", new StringReader(this.text));
    // OffsetAttribute offsetAttribute = tokenStream.addAttribute(OffsetAttribute.class);
    CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class);
    // int tokenCount = 0;

    tokenStream.reset();
    while (tokenStream.incrementToken()) {
        // int startOffset = offsetAttribute.startOffset();
        // int endOffset = offsetAttribute.endOffset();
        String termToken = charTermAttribute.toString(); // The actual token term
        nGrams.add(termToken); // Add all ngrams to the ngram LinkedList

        // If n-grams are not allowed to overlap, then increment to point of no overlap
        if (!overlap) {
            for (int i = 0; i < length - 1; i++) {
                tokenStream.incrementToken();
            }
        }
    }

    // Store unique nGrams and frequencies in hash tables
    for (String nGram : nGrams) {
        if (nGramFreqs.containsKey(nGram)) {
            nGramFreqs.put(nGram, nGramFreqs.get(nGram) + 1);
        } else {
            nGramFreqs.put(nGram, 1);
            uniqueNGrams.add(nGram);
        }
    }
}
From source file:com.finderbots.miner.PhraseShingleAnalyzer.java
License:Apache License
public List<String> getTermList(String contentText) {
    TokenStream stream = _analyzer.tokenStream("content", new StringReader(contentText));
    TermAttribute termAtt = (TermAttribute) stream.addAttribute(TermAttribute.class);
    List<String> result = new ArrayList<String>(contentText.length() / 10);
    try {
        while (stream.incrementToken()) {
            if (termAtt.termLength() > 0) {
                String term = termAtt.term();
                result.add(term);
            }
        }
    } catch (IOException e) {
        throw new RuntimeException("Impossible error", e);
    }
    return result;
}