List of usage examples for org.apache.lucene.analysis.TokenStream#close()
@Override public void close() throws IOException
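All of the examples below follow the same consume-then-close contract: obtain a TokenStream from an Analyzer, call reset(), iterate with incrementToken(), call end(), and finally close(). A minimal, self-contained sketch of that lifecycle (field name and input text are placeholders; assumes Lucene 5.x or later, where StandardAnalyzer has a no-argument constructor and TokenStream implements Closeable):

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class TokenStreamCloseExample {
    public static void main(String[] args) throws Exception {
        Analyzer analyzer = new StandardAnalyzer();
        // try-with-resources works because TokenStream implements Closeable
        try (TokenStream ts = analyzer.tokenStream("field", "some input text")) {
            CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
            ts.reset();                      // required before the first incrementToken()
            while (ts.incrementToken()) {
                System.out.println(termAtt.toString());
            }
            ts.end();                        // records end-of-stream offset state
        }                                    // try-with-resources calls ts.close()
        analyzer.close();
    }
}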
From source file:LogAnalyzerTest.java
License:Open Source License
public void assertAnalyzesTo(String input, String[] output) throws Exception {
    System.out.println(input);
    AnalyzerUtils.displayTokensWithFullDetails(analyzer, input);
    TokenStream stream = analyzer.tokenStream("field", new StringReader(input));
    TermAttribute termAttr = stream.addAttribute(TermAttribute.class);
    for (String expected : output) {
        Assert.assertTrue(stream.incrementToken());
        Assert.assertEquals(expected, termAttr.term());
    }
    Assert.assertFalse(stream.incrementToken());
    stream.close();
}
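This example (like the two AnalyzerUtils variants further down) uses the TermAttribute API from Lucene 3.x, which was removed in Lucene 4.0. A sketch of the same assertion loop against the newer CharTermAttribute API, with the reset()/end() calls that later versions require (the surrounding analyzer field and Assert usage are assumed from the original test):

public void assertAnalyzesTo(String input, String[] output) throws Exception {
    TokenStream stream = analyzer.tokenStream("field", new StringReader(input));
    CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
    stream.reset();
    for (String expected : output) {
        Assert.assertTrue(stream.incrementToken());
        Assert.assertEquals(expected, termAtt.toString());
    }
    Assert.assertFalse(stream.incrementToken());
    stream.end();
    stream.close();
}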
From source file:ClassifierHD.java
License:Apache License
public static void main(String[] args) throws Exception {
    if (args.length < 5) {
        System.out.println(
                "Arguments: [model] [label index] [dictionnary] [document frequency] [postgres table] [hdfs dir] [job_id]");
        return;
    }
    String modelPath = args[0];
    String labelIndexPath = args[1];
    String dictionaryPath = args[2];
    String documentFrequencyPath = args[3];
    String tablename = args[4];
    String inputDir = args[5];

    Configuration configuration = new Configuration();

    // model is a matrix (wordId, labelId) => probability score
    NaiveBayesModel model = NaiveBayesModel.materialize(new Path(modelPath), configuration);
    StandardNaiveBayesClassifier classifier = new StandardNaiveBayesClassifier(model);

    // labels is a map label => classId
    Map<Integer, String> labels = BayesUtils.readLabelIndex(configuration, new Path(labelIndexPath));
    Map<String, Integer> dictionary = readDictionnary(configuration, new Path(dictionaryPath));
    Map<Integer, Long> documentFrequency = readDocumentFrequency(configuration, new Path(documentFrequencyPath));

    // analyzer used to extract word from tweet
    Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_43);

    int labelCount = labels.size();
    int documentCount = documentFrequency.get(-1).intValue();

    System.out.println("Number of labels: " + labelCount);
    System.out.println("Number of documents in training set: " + documentCount);

    Connection conn = null;
    PreparedStatement pstmt = null;

    try {
        Class.forName("org.postgresql.Driver");
        conn = DriverManager.getConnection("jdbc:postgresql://192.168.50.170:5432/uzeni", "postgres",
                "dbwpsdkdl");
        conn.setAutoCommit(false);
        String sql = "INSERT INTO " + tablename
                + " (id,gtime,wtime,target,num,link,body,rep) VALUES (?,?,?,?,?,?,?,?);";
        pstmt = conn.prepareStatement(sql);

        FileSystem fs = FileSystem.get(configuration);
        FileStatus[] status = fs.listStatus(new Path(inputDir));
        BufferedWriter bw = new BufferedWriter(
                new OutputStreamWriter(fs.create(new Path(inputDir + "/rep.list"), true)));

        for (int i = 0; i < status.length; i++) {
            BufferedReader br = new BufferedReader(new InputStreamReader(fs.open(status[i].getPath())));
            if (new String(status[i].getPath().getName()).equals("rep.list")) {
                continue;
            }
            int lv_HEAD = 1;
            int lv_cnt = 0;
            String lv_gtime = null;
            String lv_wtime = null;
            String lv_target = null;
            BigDecimal lv_num = null;
            String lv_link = null;
            String[] lv_args;
            String lv_line;
            StringBuilder lv_txt = new StringBuilder();
            while ((lv_line = br.readLine()) != null) {
                if (lv_cnt < lv_HEAD) {
                    lv_args = lv_line.split(",");
                    lv_gtime = lv_args[0];
                    lv_wtime = lv_args[1];
                    lv_target = lv_args[2];
                    lv_num = new BigDecimal(lv_args[3]);
                    lv_link = lv_args[4];
                } else {
                    lv_txt.append(lv_line + '\n');
                }
                lv_cnt++;
            }
            br.close();

            String id = status[i].getPath().getName();
            String message = lv_txt.toString();

            Multiset<String> words = ConcurrentHashMultiset.create();

            TokenStream ts = analyzer.tokenStream("text", new StringReader(message));
            CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
            ts.reset();
            int wordCount = 0;
            while (ts.incrementToken()) {
                if (termAtt.length() > 0) {
                    String word = ts.getAttribute(CharTermAttribute.class).toString();
                    Integer wordId = dictionary.get(word);
                    if (wordId != null) {
                        words.add(word);
                        wordCount++;
                    }
                }
            }
            ts.end();
            ts.close();

            Vector vector = new RandomAccessSparseVector(10000);
            TFIDF tfidf = new TFIDF();
            for (Multiset.Entry<String> entry : words.entrySet()) {
                String word = entry.getElement();
                int count = entry.getCount();
                Integer wordId = dictionary.get(word);
                Long freq = documentFrequency.get(wordId);
                double tfIdfValue = tfidf.calculate(count, freq.intValue(), wordCount, documentCount);
                vector.setQuick(wordId, tfIdfValue);
            }
            Vector resultVector = classifier.classifyFull(vector);
            double bestScore = -Double.MAX_VALUE;
            int bestCategoryId = -1;
            for (Element element : resultVector.all()) {
                int categoryId = element.index();
                double score = element.get();
                if (score > bestScore) {
                    bestScore = score;
                    bestCategoryId = categoryId;
                }
            }
            //System.out.println(message);
            //System.out.println(" => "+ lv_gtime + lv_wtime + lv_link + id + ":" + labels.get(bestCategoryId));

            pstmt.setString(1, id);
            pstmt.setString(2, lv_gtime);
            pstmt.setString(3, lv_wtime);
            pstmt.setString(4, lv_target);
            pstmt.setBigDecimal(5, lv_num);
            pstmt.setString(6, lv_link);
            pstmt.setString(7, message.substring(1, Math.min(50, message.length())));
            pstmt.setString(8, labels.get(bestCategoryId));
            pstmt.addBatch();
            bw.write(id + "\t" + labels.get(bestCategoryId) + "\n");
        }
        pstmt.executeBatch();
        //pstmt.clearParameters();
        pstmt.close();
        conn.commit();
        conn.close();
        bw.close();
    } catch (Exception e) {
        System.err.println(e.getClass().getName() + ": " + e.getMessage());
        System.exit(0);
    }
    analyzer.close();
}
From source file:PostgresClassifier.java
License:Apache License
public static void main(String[] args) throws Exception {
    if (args.length < 5) {
        System.out.println(
                "Arguments: [model] [label index] [dictionnary] [document frequency] [input postgres table]");
        return;
    }
    String modelPath = args[0];
    String labelIndexPath = args[1];
    String dictionaryPath = args[2];
    String documentFrequencyPath = args[3];
    String tablename = args[4];

    Configuration configuration = new Configuration();

    // model is a matrix (wordId, labelId) => probability score
    NaiveBayesModel model = NaiveBayesModel.materialize(new Path(modelPath), configuration);
    StandardNaiveBayesClassifier classifier = new StandardNaiveBayesClassifier(model);

    // labels is a map label => classId
    Map<Integer, String> labels = BayesUtils.readLabelIndex(configuration, new Path(labelIndexPath));
    Map<String, Integer> dictionary = readDictionnary(configuration, new Path(dictionaryPath));
    Map<Integer, Long> documentFrequency = readDocumentFrequency(configuration, new Path(documentFrequencyPath));

    // analyzer used to extract word from tweet
    Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_43);

    int labelCount = labels.size();
    int documentCount = documentFrequency.get(-1).intValue();

    System.out.println("Number of labels: " + labelCount);
    System.out.println("Number of documents in training set: " + documentCount);

    Connection c = null;
    Statement stmt = null;
    Statement stmtU = null;
    try {
        Class.forName("org.postgresql.Driver");
        c = DriverManager.getConnection("jdbc:postgresql://192.168.50.170:5432/uzeni", "postgres",
                "dbwpsdkdl");
        c.setAutoCommit(false);
        System.out.println("Opened database successfully");

        stmt = c.createStatement();
        stmtU = c.createStatement();
        ResultSet rs = stmt.executeQuery("SELECT * FROM " + tablename + " WHERE rep is null");

        while (rs.next()) {
            String seq = rs.getString("seq");
            //String rep = rs.getString("rep");
            String body = rs.getString("body");
            //String category = rep;
            String id = seq;
            String message = body;
            //System.out.println("Doc: " + id + "\t" + message);

            Multiset<String> words = ConcurrentHashMultiset.create();

            // extract words from tweet
            TokenStream ts = analyzer.tokenStream("text", new StringReader(message));
            CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
            ts.reset();
            int wordCount = 0;
            while (ts.incrementToken()) {
                if (termAtt.length() > 0) {
                    String word = ts.getAttribute(CharTermAttribute.class).toString();
                    Integer wordId = dictionary.get(word);
                    // if the word is not in the dictionary, skip it
                    if (wordId != null) {
                        words.add(word);
                        wordCount++;
                    }
                }
            }
            // Mark : Modified
            ts.end();
            ts.close();

            // create vector wordId => weight using tfidf
            Vector vector = new RandomAccessSparseVector(10000);
            TFIDF tfidf = new TFIDF();
            for (Multiset.Entry<String> entry : words.entrySet()) {
                String word = entry.getElement();
                int count = entry.getCount();
                Integer wordId = dictionary.get(word);
                Long freq = documentFrequency.get(wordId);
                double tfIdfValue = tfidf.calculate(count, freq.intValue(), wordCount, documentCount);
                vector.setQuick(wordId, tfIdfValue);
            }
            // With the classifier, we get one score for each label
            // The label with the highest score is the one the tweet is more likely to
            // be associated to
            Vector resultVector = classifier.classifyFull(vector);
            double bestScore = -Double.MAX_VALUE;
            int bestCategoryId = -1;
            for (Element element : resultVector.all()) {
                int categoryId = element.index();
                double score = element.get();
                if (score > bestScore) {
                    bestScore = score;
                    bestCategoryId = categoryId;
                }
                //System.out.print(" " + labels.get(categoryId) + ": " + score);
            }
            //System.out.println(" => " + labels.get(bestCategoryId));
            //System.out.println("UPDATE " + tablename + " SET rep = '" + labels.get(bestCategoryId) + "' WHERE seq = " + id );
            stmtU.executeUpdate("UPDATE " + tablename + " SET rep = '" + labels.get(bestCategoryId)
                    + "' WHERE seq = " + id);
        }
        rs.close();
        stmt.close();
        stmtU.close();
        c.commit();
        c.close();
        analyzer.close();
    } catch (Exception e) {
        System.err.println(e.getClass().getName() + ": " + e.getMessage());
        System.exit(0);
    }
}
From source file:SimpleNaiveBayesDocumentClassifier.java
License:Apache License
/**
 * Returns a token array from the {@link org.apache.lucene.analysis.TokenStream} in input
 *
 * @param tokenizedText the tokenized content of a field
 * @return a {@code String} array of the resulting tokens
 * @throws java.io.IOException If tokenization fails because there is a low-level I/O error
 */
protected String[] getTokenArray(TokenStream tokenizedText) throws IOException {
    Collection<String> tokens = new LinkedList<>();
    CharTermAttribute charTermAttribute = tokenizedText.addAttribute(CharTermAttribute.class);
    tokenizedText.reset();
    while (tokenizedText.incrementToken()) {
        tokens.add(charTermAttribute.toString());
    }
    tokenizedText.end();
    tokenizedText.close();
    return tokens.toArray(new String[tokens.size()]);
}
From source file:analysis.AnalyzerUtils.java
License:Apache License
public static void assertAnalyzesTo(Analyzer analyzer, String input, String[] output) throws Exception {
    TokenStream stream = analyzer.tokenStream("field", new StringReader(input));
    TermAttribute termAttr = stream.addAttribute(TermAttribute.class);
    for (String expected : output) {
        Assert.assertTrue(stream.incrementToken());
        Assert.assertEquals(expected, termAttr.term());
    }
    Assert.assertFalse(stream.incrementToken());
    stream.close();
}
From source file:analyzers.DebugAnalyzer.java
License:Apache License
/**
 * This method outputs token-by-token analysis of documents.
 *
 * @param reader the reader for the documents
 * @param analyzer the analyzer
 * @throws IOException cannot load stream
 */
public static void showAnalysisFromStream(Reader reader, Analyzer analyzer) throws IOException {
    TokenStream stream = analyzer.tokenStream("text", reader);
    CharTermAttribute cta = stream.addAttribute(CharTermAttribute.class);
    OffsetAttribute oa = stream.addAttribute(OffsetAttribute.class);
    TypeAttribute typeAtt = stream.addAttribute(TypeAttribute.class);
    try {
        stream.reset();
        while (stream.incrementToken()) {
            // get starting and ending offsets
            int start = oa.startOffset();
            int end = oa.endOffset();
            // text of the token
            String token = cta.toString();
            // part of speech tag for the token
            String tag = typeAtt.type();
            System.out.printf("start: %4d\tend: %4d\tlength: %4d\ttag: %s\ttoken: %s\n", start, end,
                    token.length(), tag, token);
        }
    } finally {
        stream.close();
    }
}
From source file:aos.lucene.analysis.AnalyzerUtils.java
License:Apache License
public static void assertAnalyzesTo(Analyzer analyzer, String input, String[] output) throws Exception {
    TokenStream stream = analyzer.tokenStream("field", new StringReader(input));
    TermAttribute termAttr = stream.addAttribute(TermAttribute.class);
    for (String expected : output) {
        Assert.assertTrue(stream.incrementToken());
        Assert.assertEquals(expected, termAttr.term());
    }
    Assert.assertFalse(stream.incrementToken());
    stream.close();
}
From source file:at.ac.univie.mminf.luceneSKOS.analysis.AbstractMeSHFilter.java
License:Apache License
public static CharsRef analyze(Analyzer analyzer, String text, CharsRef reuse) throws IOException {
    TokenStream ts = analyzer.tokenStream("", new StringReader(text));
    CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
    // PositionIncrementAttribute posIncAtt =
    // ts.addAttribute(PositionIncrementAttribute.class);
    ts.reset();
    reuse.length = 0;
    while (ts.incrementToken()) {
        int length = termAtt.length();
        if (length == 0) {
            throw new IllegalArgumentException("term: " + text + " analyzed to a zero-length token");
        }
        // if (posIncAtt.getPositionIncrement() != 1) {
        // throw new IllegalArgumentException("term: " + text +
        // " analyzed to a token with posinc != 1");
        // }
        reuse.grow(reuse.length + length + 1); /* current + word + separator */
        int end = reuse.offset + reuse.length;
        if (reuse.length > 0) {
            reuse.chars[end++] = 32; // space
            reuse.length++;
        }
        System.arraycopy(termAtt.buffer(), 0, reuse.chars, end, length);
        reuse.length += length;
    }
    ts.end();
    ts.close();
    if (reuse.length == 0) {
        throw new IllegalArgumentException("term: " + text + " was completely eliminated by analyzer");
    }
    return reuse;
}
From source file:at.ac.univie.mminf.luceneSKOS.analysis.SNOMEDFilter.java
License:Apache License
public static CharsRef analyze(Analyzer analyzer, String text, CharsRef reuse) throws IOException {
    TokenStream ts = analyzer.tokenStream("", new StringReader(text));
    CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
    // PositionIncrementAttribute posIncAtt =
    // ts.addAttribute(PositionIncrementAttribute.class);
    boolean phraseTerm = false;
    ts.reset();
    reuse.length = 0;
    while (ts.incrementToken()) {
        // System.out.println(text + " | " + termAtt.toString());
        int length = termAtt.length();
        if (length == 0) {
            throw new IllegalArgumentException("term: " + text + " analyzed to a zero-length token");
        }
        // if (posIncAtt.getPositionIncrement() != 1) {
        // throw new IllegalArgumentException("term: " + text +
        // " analyzed to a token with posinc != 1");
        // }
        reuse.grow(reuse.length + length + 1); /* current + word + separator */
        int end = reuse.offset + reuse.length;
        if (reuse.length > 0) {
            reuse.chars[end++] = 32; // space
            reuse.length++;
            phraseTerm = true;
        }
        System.arraycopy(termAtt.buffer(), 0, reuse.chars, end, length);
        reuse.length += length;
    }
    ts.end();
    ts.close();
    if (reuse.length == 0) {
        throw new IllegalArgumentException("term: " + text + " was completely eliminated by analyzer");
    }
    if (phraseTerm) {
        reuse.grow(reuse.length + 2); /* current + word + separator */
        reuse.length += 2;
        char next = reuse.chars[0];
        for (int i = 0; i < reuse.length - 2; i++) {
            char tmp = reuse.chars[i + 1];
            reuse.chars[i + 1] = next;
            next = tmp;
        }
        reuse.chars[0] = '\"';
        reuse.chars[reuse.length - 1] = '\"';
    }
    return reuse;
}
From source file:at.ac.univie.mminf.luceneSKOS.queryparser.flexible.standard.processors.SKOSQueryNodeProcessor.java
License:Apache License
@Override
protected QueryNode postProcessNode(QueryNode node) throws QueryNodeException {

    if (node instanceof TextableQueryNode && !(node instanceof WildcardQueryNode)
            && !(node instanceof FuzzyQueryNode) && !(node instanceof RegexpQueryNode)
            && !(node.getParent() instanceof RangeQueryNode)) {

        FieldQueryNode fieldNode = ((FieldQueryNode) node);
        String text = fieldNode.getTextAsString();
        String field = fieldNode.getFieldAsString();

        TokenStream source;
        try {
            source = this.analyzer.tokenStream(field, text);
            source.reset();
        } catch (IOException e1) {
            throw new RuntimeException(e1);
        }
        CachingTokenFilter buffer = new CachingTokenFilter(source);

        PositionIncrementAttribute posIncrAtt = null;
        int numTokens = 0;
        int positionCount = 0;
        boolean severalTokensAtSamePosition = false;

        if (buffer.hasAttribute(PositionIncrementAttribute.class)) {
            posIncrAtt = buffer.getAttribute(PositionIncrementAttribute.class);
        }

        try {
            while (buffer.incrementToken()) {
                numTokens++;
                int positionIncrement = (posIncrAtt != null) ? posIncrAtt.getPositionIncrement() : 1;
                if (positionIncrement != 0) {
                    positionCount += positionIncrement;
                } else {
                    severalTokensAtSamePosition = true;
                }
            }
        } catch (IOException e) {
            // ignore
        }

        try {
            // rewind the buffer stream
            buffer.reset();
            // close original stream - all tokens buffered
            source.close();
        } catch (IOException e) {
            // ignore
        }

        if (!buffer.hasAttribute(CharTermAttribute.class)) {
            return new NoTokenFoundQueryNode();
        }

        CharTermAttribute termAtt = buffer.getAttribute(CharTermAttribute.class);

        if (numTokens == 0) {
            return new NoTokenFoundQueryNode();
        } else if (numTokens == 1) {
            String term = null;
            try {
                boolean hasNext;
                hasNext = buffer.incrementToken();
                assert hasNext == true;
                term = termAtt.toString();
            } catch (IOException e) {
                // safe to ignore, because we know the number of tokens
            }
            fieldNode.setText(term);
            return fieldNode;
        } else if (severalTokensAtSamePosition || !(node instanceof QuotedFieldQueryNode)) {
            if (positionCount == 1 || !(node instanceof QuotedFieldQueryNode)) {
                // no phrase query:
                LinkedList<QueryNode> children = new LinkedList<QueryNode>();
                for (int i = 0; i < numTokens; i++) {
                    String term = null;
                    try {
                        boolean hasNext = buffer.incrementToken();
                        assert hasNext == true;
                        term = termAtt.toString();
                    } catch (IOException e) {
                        // safe to ignore, because we know the number of tokens
                    }
                    if (buffer.hasAttribute(SKOSTypeAttribute.class) && boosts != null) {
                        SKOSTypeAttribute skosAttr = buffer.getAttribute(SKOSTypeAttribute.class);
                        children.add(new BoostQueryNode(new FieldQueryNode(field, term, -1, -1),
                                getBoost(skosAttr.getSkosType())));
                    } else {
                        children.add(new FieldQueryNode(field, term, -1, -1));
                    }
                }
                return new GroupQueryNode(new StandardBooleanQueryNode(children, positionCount == 1));
            } else {
                // phrase query:
                MultiPhraseQueryNode mpq = new MultiPhraseQueryNode();
                List<FieldQueryNode> multiTerms = new ArrayList<FieldQueryNode>();
                int position = -1;
                int i = 0;
                int termGroupCount = 0;
                for (; i < numTokens; i++) {
                    String term = null;
                    int positionIncrement = 1;
                    try {
                        boolean hasNext = buffer.incrementToken();
                        assert hasNext == true;
                        term = termAtt.toString();
                        if (posIncrAtt != null) {
                            positionIncrement = posIncrAtt.getPositionIncrement();
                        }
                    } catch (IOException e) {
                        // safe to ignore, because we know the number of tokens
                    }
                    if (positionIncrement > 0 && multiTerms.size() > 0) {
                        for (FieldQueryNode termNode : multiTerms) {
                            if (this.positionIncrementsEnabled) {
                                termNode.setPositionIncrement(position);
                            } else {
                                termNode.setPositionIncrement(termGroupCount);
                            }
                            mpq.add(termNode);
                        }
                        // Only increment once for each "group" of
                        // terms that were in the same position:
                        termGroupCount++;
                        multiTerms.clear();
                    }
                    position += positionIncrement;
                    multiTerms.add(new FieldQueryNode(field, term, -1, -1));
                }
                for (FieldQueryNode termNode : multiTerms) {
                    if (this.positionIncrementsEnabled) {
                        termNode.setPositionIncrement(position);
                    } else {
                        termNode.setPositionIncrement(termGroupCount);
                    }
                    mpq.add(termNode);
                }
                return mpq;
            }
        } else {
            TokenizedPhraseQueryNode pq = new TokenizedPhraseQueryNode();
            int position = -1;
            for (int i = 0; i < numTokens; i++) {
                String term = null;
                int positionIncrement = 1;
                try {
                    boolean hasNext = buffer.incrementToken();
                    assert hasNext == true;
                    term = termAtt.toString();
                    if (posIncrAtt != null) {
                        positionIncrement = posIncrAtt.getPositionIncrement();
                    }
                } catch (IOException e) {
                    // safe to ignore, because we know the number of tokens
                }
                FieldQueryNode newFieldNode = new FieldQueryNode(field, term, -1, -1);
                if (this.positionIncrementsEnabled) {
                    position += positionIncrement;
                    newFieldNode.setPositionIncrement(position);
                } else {
                    newFieldNode.setPositionIncrement(i);
                }
                pq.add(newFieldNode);
            }
            return pq;
        }
    }
    return node;
}
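The processor above consumes the analyzer's stream through a CachingTokenFilter so that the original TokenStream can be closed as soon as every token has been buffered. A minimal sketch of that buffer-then-close pattern, assuming Lucene 4.x semantics (as in the Version.LUCENE_43-era code above), where reset() on a filled CachingTokenFilter rewinds the cached tokens rather than resetting the input; the method name and arguments are illustrative, not part of the original class:

static void bufferAndReplayTokens(Analyzer analyzer, String field, String text) throws IOException {
    TokenStream source = analyzer.tokenStream(field, text);
    source.reset();
    CachingTokenFilter buffer = new CachingTokenFilter(source);
    CharTermAttribute termAtt = buffer.addAttribute(CharTermAttribute.class);

    // first pass fills the cache from the source stream
    int numTokens = 0;
    while (buffer.incrementToken()) {
        numTokens++;
    }

    buffer.reset();  // rewind the cached tokens
    source.close();  // safe now - every token is already buffered

    // second pass replays the cached tokens without touching the closed source
    for (int i = 0; i < numTokens; i++) {
        buffer.incrementToken();
        System.out.println(termAtt.toString());
    }
}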