List of usage examples for org.apache.lucene.analysis.TokenStream.close()
@Override public void close() throws IOException
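Every example on this page follows the same consumer contract: obtain the stream from an Analyzer, call reset() before the first incrementToken(), call end() after the last token, and finally call close() so the analyzer can be reused. Below is a minimal sketch of that workflow, assuming Lucene 4.6 (the version several examples on this page target); the field name and sample text are illustrative only.

// A minimal sketch, assuming Lucene 4.6 to match the examples below;
// the field name and sample text are illustrative, not part of any API.
import java.io.IOException;
import java.io.StringReader;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.util.Version;

public class TokenStreamCloseSketch {
    public static void main(String[] args) throws IOException {
        Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_46);
        TokenStream ts = analyzer.tokenStream("text", new StringReader("hello token stream"));
        CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
        try {
            ts.reset();                  // mandatory before the first incrementToken()
            while (ts.incrementToken()) {
                System.out.println(termAtt.toString());
            }
            ts.end();                    // records end-of-stream state, e.g. the final offset
        } finally {
            ts.close();                  // releases resources so the analyzer can be reused
        }
        analyzer.close();
    }
}

Several of the examples below close the stream in a finally block instead, which gives the same guarantee when incrementToken() throws.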
From source file:edu.stanford.rad.naivebayes.ClassifyLines.java
License:Apache License
public static void main(String[] args) throws Exception {
    // if (args.length < 5) {
    //     System.out.println("Arguments: [model] [label index] [dictionary] [document frequency] [tweet file]");
    //     return;
    // }
    // String modelPath = args[0];
    // String labelIndexPath = args[1];
    // String dictionaryPath = args[2];
    // String documentFrequencyPath = args[3];
    // String tweetsPath = args[4];
    String modelPath = "/Users/saeedhp/Dropbox/Stanford/Code/NER/files/stride/ectopicPregnancy/classification/nb";
    String labelIndexPath = "/Users/saeedhp/Dropbox/Stanford/Code/NER/files/stride/ectopicPregnancy/classification/nb/labelindex";
    String dictionaryPath = "/Users/saeedhp/Dropbox/Stanford/Code/NER/files/stride/ectopicPregnancy/vectors/TFIDFsparseSeqdir/dictionary.file-0";
    String documentFrequencyPath = "/Users/saeedhp/Dropbox/Stanford/Code/NER/files/stride/ectopicPregnancy/vectors/TFIDFsparseSeqdir/df-count/part-r-00000";
    String tweetsPath = "/Users/saeedhp/Desktop/tweet/tweet.txt";

    Configuration configuration = new Configuration();

    // model is a matrix (wordId, labelId) => probability score
    NaiveBayesModel model = NaiveBayesModel.materialize(new Path(modelPath), configuration);
    StandardNaiveBayesClassifier classifier = new StandardNaiveBayesClassifier(model);

    // labels is a map label => classId
    Map<Integer, String> labels = BayesUtils.readLabelIndex(configuration, new Path(labelIndexPath));
    Map<String, Integer> dictionary = readDictionnary(configuration, new Path(dictionaryPath));
    Map<Integer, Long> documentFrequency = readDocumentFrequency(configuration, new Path(documentFrequencyPath));

    // analyzer used to extract words from the tweet
    Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_46);

    int labelCount = labels.size();
    int documentCount = documentFrequency.get(-1).intValue();

    System.out.println("Number of labels: " + labelCount);
    System.out.println("Number of documents in training set: " + documentCount);

    BufferedReader reader = new BufferedReader(new FileReader(tweetsPath));
    while (true) {
        String line = reader.readLine();
        if (line == null) {
            break;
        }

        String[] tokens = line.split("\t", 2);
        String tweetId = tokens[0];
        String tweet = tokens[1];

        System.out.println("Tweet: " + tweetId + "\t" + tweet);

        Multiset<String> words = ConcurrentHashMultiset.create();

        // extract words from the tweet
        TokenStream ts = analyzer.tokenStream("text", new StringReader(tweet));
        CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
        ts.reset();
        int wordCount = 0;
        while (ts.incrementToken()) {
            if (termAtt.length() > 0) {
                String word = ts.getAttribute(CharTermAttribute.class).toString();
                Integer wordId = dictionary.get(word);
                // if the word is not in the dictionary, skip it
                if (wordId != null) {
                    words.add(word);
                    wordCount++;
                }
            }
        }
        // fixed error: end and close the TokenStream
        ts.end();
        ts.close();

        // create vector wordId => weight using tf-idf
        Vector vector = new RandomAccessSparseVector(10000);
        TFIDF tfidf = new TFIDF();
        for (Multiset.Entry<String> entry : words.entrySet()) {
            String word = entry.getElement();
            int count = entry.getCount();
            Integer wordId = dictionary.get(word);
            Long freq = documentFrequency.get(wordId);
            double tfIdfValue = tfidf.calculate(count, freq.intValue(), wordCount, documentCount);
            vector.setQuick(wordId, tfIdfValue);
        }

        // With the classifier, we get one score for each label.
        // The label with the highest score is the one the tweet is most
        // likely to be associated with.
        Vector resultVector = classifier.classifyFull(vector);
        double bestScore = -Double.MAX_VALUE;
        int bestCategoryId = -1;
        for (Element element : resultVector.all()) {
            int categoryId = element.index();
            double score = element.get();
            if (score > bestScore) {
                bestScore = score;
                bestCategoryId = categoryId;
            }
            System.out.print(" " + labels.get(categoryId) + ": " + score);
        }
        System.out.println(" => " + labels.get(bestCategoryId));
    }
    analyzer.close();
    reader.close();
}
From source file:edu.upenn.library.solrplugins.CaseInsensitiveSortingTextField.java
License:Apache License
@Override
public BytesRef normalizeQueryTarget(String val, boolean strict, String fieldName, boolean appendExtraDelim)
        throws IOException {
    TokenStream ts = getQueryAnalyzer().tokenStream(fieldName, val);
    try {
        ts.reset();
        CharTermAttribute termAtt = ts.getAttribute(CharTermAttribute.class);
        TypeAttribute typeAtt = ts.getAttribute(TypeAttribute.class);
        String matchType = strict ? INDEXED_TOKEN_TYPE : NORMALIZED_TOKEN_TYPE;
        while (ts.incrementToken()) {
            if (matchType.equals(typeAtt.type())) {
                BytesRefBuilder ret = new BytesRefBuilder();
                ret.copyChars(termAtt.toString());
                if (!strict || appendExtraDelim) {
                    ret.append(delimBytes, 0, delimBytes.length);
                }
                return ret.get();
            }
        }
        return new BytesRef(BytesRef.EMPTY_BYTES);
    } finally {
        ts.close();
    }
}
From source file:edu.utsa.sifter.DocMaker.java
License:Apache License
public static boolean addBodyField(final Document doc, final String body, final Analyzer analyzer,
        boolean testEmpty) throws IOException {
    final Field f = new Field("body", body, BodyOptions);
    if (testEmpty) {
        // System.out.println("testing if doc has empty body");
        final TokenStream toks = f.tokenStream(analyzer);
        toks.reset();
        if (!toks.incrementToken()) {
            // System.out.println("empty body, won't index");
            toks.close();
            return false;
        }
        toks.close(); // also close the stream on the non-empty path to avoid a leak
    }
    doc.add(new Field("body", body, BodyOptions));
    doc.add(new LongField("body-len", body.length(), Field.Store.YES));
    return true;
}
From source file:edu.virginia.cs.utility.StringTokenizer.java
/**
 * Generates a list of tokens from the given string.
 *
 * @param string the text to tokenize
 * @return the list of tokens generated
 */
public List<String> TokenizeString(String string) {
    List<String> result = new ArrayList<>();
    try {
        TokenStream stream = analyzer.tokenStream(null, new StringReader(string));
        stream.reset();
        while (stream.incrementToken()) {
            result.add(stream.getAttribute(CharTermAttribute.class).toString());
        }
        stream.end();
        stream.close();
    } catch (IOException e) {
        throw new RuntimeException(e);
    }
    return result;
}
From source file:elhuyar.bilakit.PayloadQParserPlugin.java
License:Open Source License
@Override
protected Query getFieldQuery(String field, String queryText, boolean quoted) throws SyntaxError {
    SchemaField sf = this.schema.getFieldOrNull(field);
    if (!quoted && sf != null && sf.getType().getTypeName().endsWith("_payloads")) {
        // analyze queryText
        List<String> result = new ArrayList<String>();
        try {
            TokenStream stream = getAnalyzer().tokenStream(field, new StringReader(queryText));
            stream.reset();
            while (stream.incrementToken()) {
                result.add(stream.getAttribute(CharTermAttribute.class).toString());
            }
            stream.end();
            stream.close();
        } catch (IOException e) {
            // not thrown b/c we're using a string reader...
            throw new RuntimeException(e);
        }
        String analyzedqueryText = result.toString().replaceAll("\\[|\\]", "").replaceAll(", ", " ");
        queryText = analyzedqueryText;

        // Note that this will work for any field defined with a
        // <fieldType> of "*_payloads"
        Query plter = new PayloadTermQuery(new Term(field, queryText), new AveragePayloadFunction(), true);
        return plter;
    }
    return super.getFieldQuery(field, queryText, quoted);
}
From source file:engine.easy.indexer.writer.EasySearchIndexWriter.java
License:Apache License
/**
 * Reads the extra data field information.
 *
 * @return the token counts for the extra data field
 * @throws IOException if any IO operation on the field fails
 */
private int[] extraData(Field field, Analyzer analyzer) throws IOException {
    if (!field.isIndexed())
        return null;
    if (!field.isTokenized())
        return new int[] { 1, 1 };
    String strv = field.stringValue();
    int v[];
    if (strv == null) {
        Reader readerv = field.readerValue();
        if (readerv == null) {
            TokenStream tsv = field.tokenStreamValue();
            if (tsv == null) {
                throw new IllegalArgumentException(
                        (new StringBuilder("Cannot obtain field value. field_name: ")).append(field.name())
                                .append(".").toString());
            } else {
                v = countTokenStream(tsv);
                return v;
            }
        }
        strv = readAll(readerv);
        if (strv == null)
            throw new IllegalArgumentException((new StringBuilder("Cannot obtain field value. field_name: "))
                    .append(field.name()).append(".").toString());
        field.setValue(strv);
    }
    BufferedReader reader = new BufferedReader(new StringReader(strv));
    TokenStream ts = analyzer.tokenStream(field.name(), reader);
    v = countTokenStream(ts);
    ts.close();
    reader.close();
    return v;
}
From source file:ikanalyzer.IKAnalzyerDemo.java
License:Apache License
public static void main(String[] args) {
    // Build an IK analyzer that uses smart segmentation mode
    Analyzer analyzer = new IKAnalyzer(true);

    // Obtain a Lucene TokenStream
    TokenStream ts = null;
    try {
        ts = analyzer.tokenStream("myfield", new StringReader(
                "?????IKAnalyer can analysis english text too"));
        // Get the token offset attribute
        OffsetAttribute offset = ts.addAttribute(OffsetAttribute.class);
        // Get the token text attribute
        CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
        // Get the token type attribute
        TypeAttribute type = ts.addAttribute(TypeAttribute.class);
        // Reset the TokenStream (resets the underlying StringReader)
        ts.reset();
        // Iterate over the tokenization results
        while (ts.incrementToken()) {
            System.out.println(offset.startOffset() + " - " + offset.endOffset() + " : " + term.toString()
                    + " | " + type.type());
        }
        // Perform end-of-stream operations, e.g. set the final offset
        ts.end();
    } catch (IOException e) {
        e.printStackTrace();
    } finally {
        // Release all resources held by the TokenStream
        if (ts != null) {
            try {
                ts.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }
}
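TokenStream implements java.io.Closeable, so on Java 7+ the null-checked finally block in the demo above can be replaced with try-with-resources. A minimal sketch under the same assumptions (IKAnalyzer in smart mode and the field name "myfield" are taken from the demo; the sample text is illustrative):

Analyzer analyzer = new IKAnalyzer(true);
// close() runs automatically when the try block exits,
// even if incrementToken() throws
try (TokenStream ts = analyzer.tokenStream("myfield",
        new StringReader("IKAnalyzer can analyze english text too"))) {
    CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
    ts.reset();
    while (ts.incrementToken()) {
        System.out.println(term.toString());
    }
    ts.end();
} catch (IOException e) {
    e.printStackTrace();
}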
From source file:indexer.LineDocumentIndexer.java
Document constructDoc(FileWriter fw, String id, String line) throws Exception {
    Document doc = new Document();
    doc.add(new Field(DocVector.FIELD_ID, id, Field.Store.YES, Field.Index.NOT_ANALYZED));

    StringBuffer tokenizedContentBuff = new StringBuffer();
    TokenStream stream = analyzer.tokenStream(FIELD_WORDS, new StringReader(line));
    CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
    stream.reset();

    while (stream.incrementToken()) {
        String term = termAtt.toString();
        term = term.toLowerCase();
        tokenizedContentBuff.append(term).append(" ");
    }

    stream.end();
    stream.close();

    tokenizedContentBuff.append("\n");
    fw.write(id + "\t" + tokenizedContentBuff.toString());

    // Reanalyze
    doc.add(new Field(FIELD_WORDS, line, Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.YES));
    return doc;
}
From source file:indexer.Paragraph.java
List<Paragraph> constructParagraphs(int docId, String content) throws Exception {
    List<Paragraph> parList = new ArrayList<>();
    List<String> tokens = new ArrayList<>();
    TokenStream stream = analyzer.tokenStream("dummy", new StringReader(content));
    CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
    stream.reset();

    int count = 0;
    int id = 0;
    while (stream.incrementToken()) {
        String term = termAtt.toString();
        tokens.add(term);
        count++;
        if (count == paraWindowSize) {
            // create a paragraph
            Paragraph p = new Paragraph(docId + "_" + String.valueOf(id++), tokens);
            tokens.clear();
            count = 0;
            parList.add(p);
        }
    }
    if (count > 0) {
        Paragraph p = new Paragraph(docId + "_" + String.valueOf(id++), tokens);
        parList.add(p);
    }

    stream.end();
    stream.close();
    return parList;
}
From source file:indexer.WordVecSequenceFileGenerator.java
String embedWords(Document d) throws Exception {
    String content = d.get(AMI_FIELDS.FIELD_CONTENT);
    int decScore = Integer.parseInt(d.get(AMI_FIELDS.FIELD_DECISION_SCORE)) > 0 ? 1 : 0;
    int prefScore = Integer.parseInt(d.get(AMI_FIELDS.FIELD_PREF_SCORE)) > 0 ? 1 : 0;

    List<String> tokens = new ArrayList<>();
    TokenStream stream = analyzer.tokenStream("dummy", new StringReader(content));
    CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
    stream.reset();

    StringBuffer buff = new StringBuffer();
    boolean labelsStoredWithWords = Boolean.parseBoolean(prop.getProperty("word.labels", "false"));
    while (stream.incrementToken()) {
        String term = termAtt.toString().toLowerCase();
        String[] wordAndLabel = null;

        if (labelsStoredWithWords) {
            wordAndLabel = term.split("\\" + AMIIndexer.WORD_LABEL_DELIM);
            term = wordAndLabel[0]; // the first part is the word
            decScore = Integer.parseInt(wordAndLabel[1]);
            prefScore = Integer.parseInt(wordAndLabel[2]);
        }
        double[] x = wvecs.getWordVector(term);
        if (x == null) {
            System.err.println("No vec found for word " + term);
            continue;
        }
        String wvec = vecToStr(x);
        if (decScore > 1)
            decScore = 1;
        if (prefScore > 1)
            prefScore = 1;
        buff.append(wvec).append("\t").append(decScore).append("\t").append(prefScore).append("\n");
    }
    stream.close();
    return buff.toString();
}