List of usage examples for org.apache.lucene.analysis.TokenStream.reset()
public void reset() throws IOException
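Common to all of the examples below is the TokenStream consume contract: reset() must be called once before the first incrementToken(), and end()/close() once afterwards. A minimal self-contained sketch of that lifecycle, assuming a Lucene 4.x classpath (the field name "body", the sample text, and the Version.LUCENE_43 constant are placeholders, not taken from any example below):

import java.io.IOException;
import java.io.StringReader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.util.Version;

public class TokenStreamResetSketch {
    public static void main(String[] args) throws IOException {
        Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_43);
        TokenStream stream = analyzer.tokenStream("body", new StringReader("some sample text"));
        CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
        stream.reset();                  // mandatory before the first incrementToken()
        while (stream.incrementToken()) {
            System.out.println(termAtt.toString());
        }
        stream.end();                    // records the attribute state past the final token
        stream.close();                  // releases the stream so the analyzer can reuse it
        analyzer.close();
    }
}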
From source file:com.basistech.IndexFiles.java
License:Open Source License
private void iterateOverFiles(File directory) throws IOException {
    File[] textFiles = directory.listFiles(new FilenameFilter() {
        public boolean accept(File dir, String name) {
            return name.endsWith(".txt");
        }
    });
    for (File dataFile : textFiles) {
        Reader dataReader = null;
        try {
            dataReader = Files.newReader(dataFile, Charsets.UTF_8);
            TokenStream tokenStream = analyzer.tokenStream("full_text", dataReader);
            tokenStream.reset();
            OffsetAttribute offsets = tokenStream.getAttribute(OffsetAttribute.class);
            while (tokenStream.incrementToken()) {
                offsets.startOffset();
            }
        } finally {
            IOUtils.closeQuietly(dataReader);
        }
    }
}
From source file:com.billiger.solr.handler.component.QLTBComponent.java
License:Apache License
/**
 * Get analyzed version of the query string.
 *
 * This uses the analyzer for the configured FieldType for this
 * component to analyze and re-assemble the original query string.
 * If no queryFieldType is configured, the original query will be
 * returned.
 *
 * This is used both in the prepare() stage of the component and
 * when reading the QLTB map data.
 */
String getAnalyzedQuery(String query) throws IOException {
    if (analyzer == null) {
        return query;
    }
    StringBuilder norm = new StringBuilder();
    TokenStream tokens = analyzer.tokenStream("", new StringReader(query));
    tokens.reset();
    CharTermAttribute termAtt = tokens.addAttribute(CharTermAttribute.class);
    while (tokens.incrementToken()) {
        norm.append(termAtt.buffer(), 0, termAtt.length());
    }
    tokens.end();
    tokens.close();
    return norm.toString();
}
From source file:com.bizosys.hsearch.inpipe.ComputeTokens.java
License:Apache License
private void tokenize(Doc doc, TermStream ts) throws SystemFault, ApplicationFault, IOException {
    if (null == ts) return;
    TokenStream stream = ts.stream;
    if (null == stream) return;
    DocTerms terms = doc.terms;
    if (null == doc.terms) {
        terms = new DocTerms();
        doc.terms = terms;
    }
    String token = null;
    int offset = 0;
    CharTermAttribute termA = stream.getAttribute(CharTermAttribute.class);
    OffsetAttribute offsetA = stream.getAttribute(OffsetAttribute.class);
    stream.reset();
    while (stream.incrementToken()) {
        token = termA.toString();
        offset = offsetA.startOffset();
        Term term = new Term(doc.tenant, token, ts.sighting, ts.type, offset);
        terms.getTermList().add(term);
    }
    stream.close();
}
From source file:com.bull.aurocontrol.csst.poc.index.interval.InNumericIntervalQuery.java
License:Apache License
/**
 * Creates a query to find intervals a number is in.
 * @param name The name of the field to search.
 * @param value The search value.
 * @param precisionStep The precision step used when indexing the field.
 */
public InNumericIntervalQuery(final String name, final long value, final int precisionStep) {
    super(true);
    this.value = value;
    TokenStream stream = new NumericTokenStream(precisionStep).setLongValue(value);
    try {
        stream.reset();
        // TermAttribute is the pre-Lucene-4 API for reading the token text;
        // each token here is one trie-encoded form of the value
        while (stream.incrementToken()) {
            this.add(new TermQuery(new Term(name, stream.getAttribute(TermAttribute.class).term())),
                    BooleanClause.Occur.SHOULD);
        }
    } catch (IOException e) {
        // preserve the cause even though NumericTokenStream does no actual IO
        throw new IllegalStateException("This should never happen - NumericTokenStream does no IO.", e);
    }
}
From source file:com.chimpler.example.bayes.Classifier.java
License:Apache License
public static void main(String[] args) throws Exception {
    if (args.length < 5) {
        System.out.println("Arguments: [model] [label index] [dictionary] [document frequency] [tweet file]");
        return;
    }
    String modelPath = args[0];
    String labelIndexPath = args[1];
    String dictionaryPath = args[2];
    String documentFrequencyPath = args[3];
    String tweetsPath = args[4];
    Configuration configuration = new Configuration();
    // model is a matrix (wordId, labelId) => probability score
    NaiveBayesModel model = NaiveBayesModel.materialize(new Path(modelPath), configuration);
    StandardNaiveBayesClassifier classifier = new StandardNaiveBayesClassifier(model);
    // labels is a map label => classId
    Map<Integer, String> labels = BayesUtils.readLabelIndex(configuration, new Path(labelIndexPath));
    Map<String, Integer> dictionary = readDictionnary(configuration, new Path(dictionaryPath));
    Map<Integer, Long> documentFrequency = readDocumentFrequency(configuration,
            new Path(documentFrequencyPath));
    // analyzer used to extract words from tweets
    Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_43);
    int labelCount = labels.size();
    int documentCount = documentFrequency.get(-1).intValue();
    System.out.println("Number of labels: " + labelCount);
    System.out.println("Number of documents in training set: " + documentCount);
    BufferedReader reader = new BufferedReader(new FileReader(tweetsPath));
    while (true) {
        String line = reader.readLine();
        if (line == null) {
            break;
        }
        String[] tokens = line.split("\t", 2);
        String tweetId = tokens[0];
        String tweet = tokens[1];
        System.out.println("Tweet: " + tweetId + "\t" + tweet);
        Multiset<String> words = ConcurrentHashMultiset.create();
        // extract words from the tweet
        TokenStream ts = analyzer.tokenStream("text", new StringReader(tweet));
        CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
        ts.reset();
        int wordCount = 0;
        while (ts.incrementToken()) {
            if (termAtt.length() > 0) {
                String word = ts.getAttribute(CharTermAttribute.class).toString();
                Integer wordId = dictionary.get(word);
                // if the word is not in the dictionary, skip it
                if (wordId != null) {
                    words.add(word);
                    wordCount++;
                }
            }
        }
        // release the stream so the analyzer can be reused for the next tweet
        ts.end();
        ts.close();
        // create vector wordId => weight using tfidf
        Vector vector = new RandomAccessSparseVector(10000);
        TFIDF tfidf = new TFIDF();
        for (Multiset.Entry<String> entry : words.entrySet()) {
            String word = entry.getElement();
            int count = entry.getCount();
            Integer wordId = dictionary.get(word);
            Long freq = documentFrequency.get(wordId);
            double tfIdfValue = tfidf.calculate(count, freq.intValue(), wordCount, documentCount);
            vector.setQuick(wordId, tfIdfValue);
        }
        // With the classifier, we get one score for each label.
        // The label with the highest score is the one the tweet is most
        // likely to be associated with.
        Vector resultVector = classifier.classifyFull(vector);
        double bestScore = -Double.MAX_VALUE;
        int bestCategoryId = -1;
        for (Element element : resultVector.all()) {
            int categoryId = element.index();
            double score = element.get();
            if (score > bestScore) {
                bestScore = score;
                bestCategoryId = categoryId;
            }
            System.out.print(" " + labels.get(categoryId) + ": " + score);
        }
        System.out.println(" => " + labels.get(bestCategoryId));
    }
    analyzer.close();
    reader.close();
}
From source file:com.chriscx.stem.Stem.java
public String evaluate(BufferedReader input) {
    if (input == null) {
        return null;
    }
    CharArraySet stopWordsSet = new CharArraySet(Version.LUCENE_46, 10000, true);
    String stopWords = "a afin ai ainsi après attendu au aujourd auquel aussi "
            + "autre autres aux auxquelles auxquels avait avant avec c car ce "
            + "ceci cela celle celles celui cependant certain certaine certaines "
            + "certains ces cet cette ceux chez ci combien comme comment "
            + "concernant contre d dans de debout dedans dehors delà depuis "
            + "derrière des désormais desquelles desquels devers devra doit "
            + "donc dont du duquel durant dès elle elles en entre environ est"
            + " et etc eu eux excepté hormis hors hélas hui il ils j je jusqu "
            + "jusque l la laquelle le lequel les lesquelles lesquels leur leurs "
            + "lorsque lui là ma mais malgré me merci mes mien mienne miennes "
            + "miens moins mon moyennant même mêmes n ne ni non nos notre nous "
            + "néanmoins nôtre nôtres on ont ou outre où par parmi partant pas "
            + "passé pendant plein plus plusieurs pour pourquoi proche près "
            + "puisque qu quand que quel quelle quelles quels qui quoi quoique"
            + " revoici revoilà s sa sans sauf se selon seront ses si sien "
            + "sienne siennes siens sinon soi soit son sont sous suivant sur "
            + "ta te tes tien tienne tiennes tiens ton tous tout toute toutes"
            + " tu un une va vers voici voilà vos votre vous vu vôtre vôtres y "
            + "été être";
    String[] stopWordsTab = stopWords.split(" ");
    for (String word : stopWordsTab) {
        stopWordsSet.add(word);
    }
    Analyzer analyzer = new FrenchAnalyzer(Version.LUCENE_46, stopWordsSet);
    result = "";
    try {
        String line = input.readLine();
        while (line != null) {
            // strip e-mail addresses, phone numbers and other digits,
            // underscores/dashes, control whitespace and most punctuation
            // before tokenizing
            line = line.replaceAll("(\\S)+@(\\S)+.(\\S)+", "");
            line = line.replaceAll("(0[0-68]([-. ]?\\d{2}){4}[-. ]?)|\\d+", "");
            line = line.replaceAll("(_|-)+", "");
            line = line.replaceAll("(\\n|\\r|\\t)+", "");
            line = line.replaceAll("(?![\\._])\\p{P}", "");
            TokenStream stream = analyzer.tokenStream(null, line);
            stream.reset();
            while (stream.incrementToken()) {
                String wordset = stream.getAttribute(CharTermAttribute.class).toString();
                wordset = wordset.replaceAll("(0[0-68]([-. ]?\\d{2}){4}[-. ]?)|\\d+", "");
                result += wordset + " ";
            }
            result += "\n";
            stream.close();
            line = input.readLine();
        }
        input.close();
        return result;
    } catch (IOException e) {
        // wrap any I/O failure from the reader as an unchecked exception
        throw new RuntimeException(e);
    }
}
From source file:com.clustertest2.clustertest2.vectorization.DocTokenizer.java
public void performWork(Path doc) throws IOException {
    try {
        System.out.println("performing token work");
        HashMap<Text, StringTuple> tokenized = new HashMap<>();
        StringBuilder part = new StringBuilder();
        // store the tokens of each doc
        for (Pair<Writable, Writable> pair : new SequenceFileDirIterable<>(doc, PathType.GLOB,
                ClusterFileService.CONF)) {
            String key = pair.getFirst().toString();
            System.out.println(key);
            String value = pair.getSecond().toString();
            part.append(key);
            TokenStream stream = analyzer.tokenStream(key, new StringReader(value));
            CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
            stream.reset();
            StringTuple document = new StringTuple();
            while (stream.incrementToken()) {
                if (termAtt.length() > 0) {
                    document.add(new String(termAtt.buffer(), 0, termAtt.length()));
                }
            }
            stream.end();
            stream.close();
            tokenized.put(new Text(key), document);
        }
        // write the sequence file
        Path tokenizedSeq = new Path(vectorsDir, part.toString());
        try (SequenceFile.Writer writer = new SequenceFile.Writer(ClusterFileService.FS,
                ClusterFileService.CONF, tokenizedSeq, Text.class, StringTuple.class)) {
            for (Text k : tokenized.keySet()) {
                writer.append(k, tokenized.get(k));
            }
            writer.close();
            System.out.println("wrote");
        }
    } catch (Exception e) {
        System.out.println(e.getMessage());
    } finally {
        numThreads.decrementAndGet();
    }
}
From source file:com.digitalpebble.behemoth.mahout.LuceneTokenizerMapper.java
License:Apache License
@Override
protected void map(Text key, BehemothDocument value, Context context)
        throws IOException, InterruptedException {
    String sContent = value.getText();
    if (sContent == null) {
        // no text available? skip
        context.getCounter("LuceneTokenizer", "BehemothDocWithoutText").increment(1);
        return;
    }
    TokenStream stream = analyzer.reusableTokenStream(key.toString(), new StringReader(sContent.toString()));
    CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
    StringTuple document = new StringTuple();
    stream.reset();
    while (stream.incrementToken()) {
        if (termAtt.length() > 0) {
            document.add(new String(termAtt.buffer(), 0, termAtt.length()));
        }
    }
    context.write(key, document);
}
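Note that Analyzer.reusableTokenStream, used above, is the pre-Lucene-4 entry point; it was removed in 4.0, where Analyzer.tokenStream handles per-thread reuse itself. A hedged sketch of the same tokenize-to-StringTuple step against the 4.x API (the helper name toStringTuple is hypothetical; StringTuple is the Mahout class from the example):

private static StringTuple toStringTuple(Analyzer analyzer, String field, String text) throws IOException {
    StringTuple document = new StringTuple();
    TokenStream stream = analyzer.tokenStream(field, new StringReader(text));
    CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
    try {
        stream.reset();                  // required before consuming
        while (stream.incrementToken()) {
            if (termAtt.length() > 0) {
                document.add(new String(termAtt.buffer(), 0, termAtt.length()));
            }
        }
        stream.end();
    } finally {
        stream.close();                  // lets the analyzer hand the stream out again
    }
    return document;
}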
From source file:com.doculibre.constellio.lucene.BaseLuceneIndexHelper.java
License:Open Source License
public static String analyze(String str, Analyzer analyzer) throws IOException {
    if (analyzer == null) {
        return str;
    }
    StringBuilder norm = new StringBuilder();
    TokenStream tokens = analyzer.tokenStream("", new StringReader(str));
    tokens.reset();
    CharTermAttribute termAtt = tokens.addAttribute(CharTermAttribute.class);
    while (tokens.incrementToken()) {
        norm.append(termAtt.buffer(), 0, termAtt.length());
    }
    return norm.toString();
}
From source file:com.doculibre.constellio.utils.AnalyzerUtils.java
License:Open Source License
public static String analyzePhrase(String phrase, boolean useStopWords) {
    if (StringUtils.isNotBlank(phrase)) {
        String analysedPhrase;
        Analyzer analyzer = getDefaultAnalyzer(useStopWords);
        StringBuilder norm = new StringBuilder();
        TokenStream tokens;
        try {
            tokens = analyzer.tokenStream("", new StringReader(phrase));
            tokens.reset();
            CharTermAttribute termAtt = tokens.addAttribute(CharTermAttribute.class);
            while (tokens.incrementToken()) {
                norm.append(termAtt.buffer(), 0, termAtt.length());
            }
            analysedPhrase = norm.toString().trim();
        } catch (IOException e) {
            throw new RuntimeException(e);
        }
        return analysedPhrase;
    } else {
        return phrase;
    }
}