List of usage examples for org.apache.lucene.analysis.TokenStream.incrementToken()
public abstract boolean incrementToken() throws IOException;
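incrementToken() advances the stream to the next token and returns false once the stream is exhausted; attribute instances obtained from the stream are updated in place on each call, so the consume loop reads the same attribute object repeatedly. Every example below follows that loop. As a point of reference, here is a minimal self-contained sketch of the full consumer contract (reset, incrementToken, end, close), assuming Lucene 5+ where StandardAnalyzer takes no Version argument and TokenStream implements Closeable; the field name "body" and the sample text are illustrative only:

import java.io.IOException;
import java.io.StringReader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class IncrementTokenDemo {
    public static void main(String[] args) throws IOException {
        Analyzer analyzer = new StandardAnalyzer();
        try (TokenStream ts = analyzer.tokenStream("body", new StringReader("Hello token stream world"))) {
            // the attribute object is registered once and refilled on every incrementToken()
            CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
            ts.reset();                    // mandatory before the first incrementToken()
            while (ts.incrementToken()) {  // false when no tokens remain
                System.out.println(termAtt.toString());
            }
            ts.end();                      // records end-of-stream offset state
        }
    }
}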
From source file:com.chriscx.stem.Stem.java
public String evaluate(BufferedReader input) {
    if (input == null) {
        return null;
    }
    CharArraySet stopWordsSet = new CharArraySet(Version.LUCENE_46, 10000, true);
    String stopWords = "a afin ai ainsi après attendu au aujourd auquel aussi "
            + "autre autres aux auxquelles auxquels avait avant avec c car ce "
            + "ceci cela celle celles celui cependant certain certaine certaines "
            + "certains ces cet cette ceux chez ci combien comme comment "
            + "concernant contre d dans de debout dedans dehors delà depuis "
            + "derrière des désormais desquelles desquels devers devra doit "
            + "donc dont du duquel durant dès elle elles en entre environ est"
            + " et etc eu eux excepté hormis hors hélas hui il ils j je jusqu "
            + "jusque l la laquelle le lequel les lesquelles lesquels leur leurs "
            + "lorsque lui là ma mais malgré me merci mes mien mienne miennes "
            + "miens moins mon moyennant même mêmes n ne ni non nos notre nous "
            + "néanmoins nôtre nôtres on ont ou outre où par parmi partant pas "
            + "passé pendant plein plus plusieurs pour pourquoi proche près "
            + "puisque qu quand que quel quelle quelles quels qui quoi quoique"
            + " revoici revoilà s sa sans sauf se selon seront ses si sien "
            + "sienne siennes siens sinon soi soit son sont sous suivant sur "
            + "ta te tes tien tienne tiennes tiens ton tous tout toute toutes"
            + " tu un une va vers voici voilà vos votre vous vu vôtre vôtres y "
            + "à été être";
    String[] stopWordsTab = stopWords.split(" ");
    for (String word : stopWordsTab) {
        stopWordsSet.add(word);
    }
    Analyzer analyzer = new FrenchAnalyzer(Version.LUCENE_46, stopWordsSet);
    result = "";
    try {
        String line = input.readLine();
        while (line != null) {
            // strip e-mail addresses, phone numbers and other digits,
            // underscore/dash runs, stray line breaks, and most punctuation
            // (the original applied these only to the first line; they must
            // run on every line read)
            line = line.replaceAll("(\\S)+@(\\S)+\\.(\\S)+", "");
            line = line.replaceAll("(0[0-68]([-. ]?\\d{2}){4}[-. ]?)|\\d+", "");
            line = line.replaceAll("(_|-)+", "");
            line = line.replaceAll("(\\n|\\r|\\t)+", "");
            line = line.replaceAll("(?![\\._])\\p{P}", "");
            TokenStream stream = analyzer.tokenStream(null, line);
            stream.reset();
            while (stream.incrementToken()) {
                String wordset = stream.getAttribute(CharTermAttribute.class).toString();
                wordset = wordset.replaceAll("(0[0-68]([-. ]?\\d{2}){4}[-. ]?)|\\d+", "");
                result += wordset + " ";
            }
            result += "\n";
            stream.end();
            stream.close();
            line = input.readLine();
        }
        input.close();
        return result;
    } catch (IOException e) {
        throw new RuntimeException(e);
    }
}
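Note that evaluate() only closes the stream on the success path, so an exception thrown between reset() and close() would leave the analyzer's reusable components unreleased. One possible tightening of the inner loop, sketched under the assumption of Lucene 4+ where TokenStream implements Closeable (token post-processing elided):

try (TokenStream stream = analyzer.tokenStream(null, line)) {
    stream.reset();
    while (stream.incrementToken()) {
        result += stream.getAttribute(CharTermAttribute.class).toString() + " ";
    }
    stream.end(); // close() is handled by try-with-resources, even on failure
}
result += "\n";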
From source file:com.cloudera.knittingboar.records.Test20NewsgroupsBookParsing.java
License:Apache License
/**
 * Counts words.
 *
 * @param analyzer
 * @param words
 * @param in
 * @throws IOException
 */
private static void countWords(Analyzer analyzer, Collection<String> words, Reader in) throws IOException {
    System.out.println("> ----- countWords ------");
    // use the provided analyzer to tokenize the input stream
    TokenStream ts = analyzer.tokenStream("text", in);
    ts.addAttribute(CharTermAttribute.class);
    ts.reset(); // required before the first incrementToken() on recent Lucene versions
    // for each word in the stream, minus non-word stuff, add word to collection
    while (ts.incrementToken()) {
        String s = ts.getAttribute(CharTermAttribute.class).toString();
        System.out.print(" " + s);
        words.add(s);
    }
    System.out.println("\n<");
}
From source file:com.cloudera.knittingboar.records.TwentyNewsgroupsRecordFactory.java
License:Apache License
private static void countWords(Analyzer analyzer, Collection<String> words, Reader in) throws IOException {
    // use the provided analyzer to tokenize the input stream
    TokenStream ts = analyzer.tokenStream("text", in);
    ts.addAttribute(CharTermAttribute.class);
    ts.reset(); // required before the first incrementToken() on recent Lucene versions
    // for each word in the stream, minus non-word stuff, add word to collection
    while (ts.incrementToken()) {
        String s = ts.getAttribute(CharTermAttribute.class).toString();
        words.add(s);
    }
}
From source file:com.cloudera.knittingboar.sgd.olr.TestBaseOLR_Train20Newsgroups.java
License:Apache License
/**
 * Counts words.
 *
 * @param analyzer
 * @param words
 * @param in
 * @throws IOException
 */
private static void countWords(Analyzer analyzer, Collection<String> words, Reader in) throws IOException {
    // use the provided analyzer to tokenize the input stream
    TokenStream ts = analyzer.tokenStream("text", in);
    ts.addAttribute(CharTermAttribute.class);
    ts.reset(); // required before the first incrementToken() on recent Lucene versions
    // for each word in the stream, minus non-word stuff, add word to collection
    while (ts.incrementToken()) {
        String s = ts.getAttribute(CharTermAttribute.class).toString();
        words.add(s);
    }
}
From source file:com.cloudera.knittingboar.utils.DatasetConverter.java
License:Apache License
private static void countWords(Analyzer analyzer, Collection<String> words, Reader in) throws IOException {
    // use the provided analyzer to tokenize the input stream
    TokenStream ts = analyzer.tokenStream("text", in);
    ts.addAttribute(CharTermAttribute.class);
    ts.reset(); // required before the first incrementToken() on recent Lucene versions
    // for each word in the stream, minus non-word stuff, add word to collection
    while (ts.incrementToken()) {
        String s = ts.getAttribute(CharTermAttribute.class).toString();
        words.add(s);
    }
}
From source file:com.cloudera.knittingboar.utils.DatasetConverter.java
License:Apache License
public static String ReadFullFile(Analyzer analyzer, String newsgroup_name, String file) throws IOException {
    // accumulate into a StringBuilder rather than concatenating Strings in the loop
    StringBuilder out = new StringBuilder(newsgroup_name).append("\t");
    BufferedReader reader = null;
    try {
        reader = new BufferedReader(new FileReader(file));
        TokenStream ts = analyzer.tokenStream("text", reader);
        ts.addAttribute(CharTermAttribute.class);
        ts.reset(); // required before the first incrementToken() on recent Lucene versions
        // for each word in the stream, minus non-word stuff, append it to the output
        while (ts.incrementToken()) {
            String s = ts.getAttribute(CharTermAttribute.class).toString();
            out.append(s).append(" ");
        }
    } finally {
        if (reader != null) {
            reader.close();
        }
    }
    return out.append("\n").toString();
}
From source file:com.clustertest2.clustertest2.vectorization.DocTokenizer.java
public void performWork(Path doc) throws IOException {
    try {
        System.out.println("performing token work");
        HashMap<Text, StringTuple> tokenized = new HashMap<>();
        StringBuilder part = new StringBuilder();
        // store the tokens of each doc
        for (Pair<Writable, Writable> pair : new SequenceFileDirIterable<>(doc, PathType.GLOB,
                ClusterFileService.CONF)) {
            String key = pair.getFirst().toString();
            System.out.println(key);
            String value = pair.getSecond().toString();
            part.append(key);
            TokenStream stream = analyzer.tokenStream(key, new StringReader(value));
            CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
            stream.reset();
            StringTuple document = new StringTuple();
            while (stream.incrementToken()) {
                if (termAtt.length() > 0) {
                    document.add(new String(termAtt.buffer(), 0, termAtt.length()));
                }
            }
            stream.end();
            stream.close();
            tokenized.put(new Text(key), document);
        }
        // write the sequence file
        Path tokenizedSeq = new Path(vectorsDir, part.toString());
        try (SequenceFile.Writer writer = new SequenceFile.Writer(ClusterFileService.FS,
                ClusterFileService.CONF, tokenizedSeq, Text.class, StringTuple.class)) {
            for (Text k : tokenized.keySet()) {
                writer.append(k, tokenized.get(k));
            }
            System.out.println("wrote");
        }
    } catch (Exception e) {
        System.out.println(e.getMessage());
    } finally {
        numThreads.decrementAndGet();
    }
}
From source file:com.dhamacher.sentimentanalysis4tweets.sentiment.Tokenizer.java
License:Apache License
/**
 * Retrieve the tokens in a String. Behaves like getTokens, but operates on
 * a string instead of a tweet object.
 *
 * @param text The text to tokenize.
 * @return The tokens in the text.
 */
public static LinkedList<String> getTokens(String text) throws IOException {
    LinkedList<String> tokens = new LinkedList<String>();
    TokenStream ts = new StandardTokenizer(new StringReader(text));
    TermAttribute termAtt = ts.getAttribute(TermAttribute.class);
    ts.reset(); // required before the first incrementToken()
    while (ts.incrementToken()) {
        tokens.add(termAtt.term());
    }
    return tokens;
}
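This snippet relies on TermAttribute, which was deprecated in Lucene 3.1 and removed in 4.0 in favor of CharTermAttribute. A hedged port of getTokens, assuming a late Lucene 4.x release where StandardTokenizer accepts a bare Reader and TokenStream implements Closeable:

public static LinkedList<String> getTokens(String text) throws IOException {
    LinkedList<String> tokens = new LinkedList<String>();
    try (TokenStream ts = new StandardTokenizer(new StringReader(text))) {
        CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
        ts.reset();
        while (ts.incrementToken()) {
            tokens.add(termAtt.toString()); // replaces TermAttribute.term()
        }
        ts.end();
    }
    return tokens;
}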
From source file:com.digitalpebble.behemoth.mahout.LuceneTokenizerMapper.java
License:Apache License
@Override
protected void map(Text key, BehemothDocument value, Context context) throws IOException, InterruptedException {
    String sContent = value.getText();
    if (sContent == null) {
        // no text available? skip
        context.getCounter("LuceneTokenizer", "BehemothDocWithoutText").increment(1);
        return;
    }
    TokenStream stream = analyzer.reusableTokenStream(key.toString(), new StringReader(sContent));
    CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
    StringTuple document = new StringTuple();
    stream.reset();
    while (stream.incrementToken()) {
        if (termAtt.length() > 0) {
            document.add(new String(termAtt.buffer(), 0, termAtt.length()));
        }
    }
    context.write(key, document);
}
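Analyzer.reusableTokenStream was removed in Lucene 4.0; tokenStream() now handles per-thread reuse internally, and consumers are expected to call end() and close() when done. A sketch of the same loop ported under that assumption:

TokenStream stream = analyzer.tokenStream(key.toString(), new StringReader(sContent));
CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
StringTuple document = new StringTuple();
stream.reset();
while (stream.incrementToken()) {
    if (termAtt.length() > 0) {
        document.add(new String(termAtt.buffer(), 0, termAtt.length()));
    }
}
stream.end();
stream.close(); // releases the stream back to the analyzer's reuse strategy
context.write(key, document);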
From source file:com.digitalpebble.classification.example.TwentyNewsgroups.java
License:Apache License
private List<String> analyseField(String content) throws IOException {
    if (content == null)
        return null;
    List<String> tokens = new ArrayList<String>();
    StringReader sr = new StringReader(content);
    TokenStream ts = analyzer.tokenStream("dummyValue", sr);
    TermAttribute term = ts.addAttribute(TermAttribute.class);
    ts.reset(); // required before the first incrementToken()
    while (ts.incrementToken()) {
        tokens.add(term.term());
    }
    return tokens;
}