List of usage examples for org.apache.lucene.analysis TokenStream getAttribute
public final <T extends Attribute> T getAttribute(Class<T> attClass)
The caller must pass in a Class<? extends Attribute> value.
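All of the examples below follow the same pattern: obtain the attribute instances from the stream, then call incrementToken() and read the (reused) attribute objects for each token. A minimal, self-contained sketch of that pattern follows; the demo class, field name, and sample text are made up for illustration, and it assumes a Lucene release whose StandardAnalyzer constructor takes no Version argument (roughly 4.10+):

import java.io.IOException;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;

public class GetAttributeDemo {
    public static void main(String[] args) throws IOException {
        Analyzer analyzer = new StandardAnalyzer();
        try (TokenStream ts = analyzer.tokenStream("body", "Lucene token streams expose per-token attributes")) {
            // addAttribute registers the attribute if missing; getAttribute only
            // returns an attribute the stream already carries.
            CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
            ts.reset();
            while (ts.incrementToken()) {
                // The same attribute instances are reused for every token,
                // so they must be read inside the loop.
                OffsetAttribute offset = ts.getAttribute(OffsetAttribute.class);
                System.out.println(term.toString() + " [" + offset.startOffset() + "-" + offset.endOffset() + "]");
            }
            ts.end();
        }
        analyzer.close();
    }
}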
From source file:ClassifierHD.java
License:Apache License
public static void main(String[] args) throws Exception {
    if (args.length < 6) {
        System.out.println(
                "Arguments: [model] [label index] [dictionnary] [document frequency] [postgres table] [hdfs dir] [job_id]");
        return;
    }
    String modelPath = args[0];
    String labelIndexPath = args[1];
    String dictionaryPath = args[2];
    String documentFrequencyPath = args[3];
    String tablename = args[4];
    String inputDir = args[5];
    Configuration configuration = new Configuration();

    // model is a matrix (wordId, labelId) => probability score
    NaiveBayesModel model = NaiveBayesModel.materialize(new Path(modelPath), configuration);
    StandardNaiveBayesClassifier classifier = new StandardNaiveBayesClassifier(model);

    // labels is a map label => classId
    Map<Integer, String> labels = BayesUtils.readLabelIndex(configuration, new Path(labelIndexPath));
    Map<String, Integer> dictionary = readDictionnary(configuration, new Path(dictionaryPath));
    Map<Integer, Long> documentFrequency = readDocumentFrequency(configuration, new Path(documentFrequencyPath));

    // analyzer used to extract word from tweet
    Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_43);

    int labelCount = labels.size();
    int documentCount = documentFrequency.get(-1).intValue();
    System.out.println("Number of labels: " + labelCount);
    System.out.println("Number of documents in training set: " + documentCount);

    Connection conn = null;
    PreparedStatement pstmt = null;
    try {
        Class.forName("org.postgresql.Driver");
        conn = DriverManager.getConnection("jdbc:postgresql://192.168.50.170:5432/uzeni", "postgres", "dbwpsdkdl");
        conn.setAutoCommit(false);
        String sql = "INSERT INTO " + tablename
                + " (id,gtime,wtime,target,num,link,body,rep) VALUES (?,?,?,?,?,?,?,?);";
        pstmt = conn.prepareStatement(sql);

        FileSystem fs = FileSystem.get(configuration);
        FileStatus[] status = fs.listStatus(new Path(inputDir));
        BufferedWriter bw = new BufferedWriter(
                new OutputStreamWriter(fs.create(new Path(inputDir + "/rep.list"), true)));

        for (int i = 0; i < status.length; i++) {
            BufferedReader br = new BufferedReader(new InputStreamReader(fs.open(status[i].getPath())));
            if (new String(status[i].getPath().getName()).equals("rep.list")) {
                continue;
            }
            int lv_HEAD = 1;
            int lv_cnt = 0;
            String lv_gtime = null;
            String lv_wtime = null;
            String lv_target = null;
            BigDecimal lv_num = null;
            String lv_link = null;
            String[] lv_args;
            String lv_line;
            StringBuilder lv_txt = new StringBuilder();
            while ((lv_line = br.readLine()) != null) {
                if (lv_cnt < lv_HEAD) {
                    lv_args = lv_line.split(",");
                    lv_gtime = lv_args[0];
                    lv_wtime = lv_args[1];
                    lv_target = lv_args[2];
                    lv_num = new BigDecimal(lv_args[3]);
                    lv_link = lv_args[4];
                } else {
                    lv_txt.append(lv_line + '\n');
                }
                lv_cnt++;
            }
            br.close();

            String id = status[i].getPath().getName();
            String message = lv_txt.toString();

            Multiset<String> words = ConcurrentHashMultiset.create();

            TokenStream ts = analyzer.tokenStream("text", new StringReader(message));
            CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
            ts.reset();
            int wordCount = 0;
            while (ts.incrementToken()) {
                if (termAtt.length() > 0) {
                    String word = ts.getAttribute(CharTermAttribute.class).toString();
                    Integer wordId = dictionary.get(word);
                    if (wordId != null) {
                        words.add(word);
                        wordCount++;
                    }
                }
            }
            ts.end();
            ts.close();

            Vector vector = new RandomAccessSparseVector(10000);
            TFIDF tfidf = new TFIDF();
            for (Multiset.Entry<String> entry : words.entrySet()) {
                String word = entry.getElement();
                int count = entry.getCount();
                Integer wordId = dictionary.get(word);
                Long freq = documentFrequency.get(wordId);
                double tfIdfValue = tfidf.calculate(count, freq.intValue(), wordCount, documentCount);
                vector.setQuick(wordId, tfIdfValue);
            }

            Vector resultVector = classifier.classifyFull(vector);
            double bestScore = -Double.MAX_VALUE;
            int bestCategoryId = -1;
            for (Element element : resultVector.all()) {
                int categoryId = element.index();
                double score = element.get();
                if (score > bestScore) {
                    bestScore = score;
                    bestCategoryId = categoryId;
                }
            }
            //System.out.println(message);
            //System.out.println(" => "+ lv_gtime + lv_wtime + lv_link + id + ":" + labels.get(bestCategoryId));

            pstmt.setString(1, id);
            pstmt.setString(2, lv_gtime);
            pstmt.setString(3, lv_wtime);
            pstmt.setString(4, lv_target);
            pstmt.setBigDecimal(5, lv_num);
            pstmt.setString(6, lv_link);
            pstmt.setString(7, message.substring(1, Math.min(50, message.length())));
            pstmt.setString(8, labels.get(bestCategoryId));
            pstmt.addBatch();
            bw.write(id + "\t" + labels.get(bestCategoryId) + "\n");
        }
        pstmt.executeBatch();
        //pstmt.clearParameters();
        pstmt.close();
        conn.commit();
        conn.close();
        bw.close();
    } catch (Exception e) {
        System.err.println(e.getClass().getName() + ": " + e.getMessage());
        System.exit(0);
    }
    analyzer.close();
}
From source file:PostgresClassifier.java
License:Apache License
public static void main(String[] args) throws Exception {
    if (args.length < 5) {
        System.out.println(
                "Arguments: [model] [label index] [dictionnary] [document frequency] [input postgres table]");
        return;
    }
    String modelPath = args[0];
    String labelIndexPath = args[1];
    String dictionaryPath = args[2];
    String documentFrequencyPath = args[3];
    String tablename = args[4];
    Configuration configuration = new Configuration();

    // model is a matrix (wordId, labelId) => probability score
    NaiveBayesModel model = NaiveBayesModel.materialize(new Path(modelPath), configuration);
    StandardNaiveBayesClassifier classifier = new StandardNaiveBayesClassifier(model);

    // labels is a map label => classId
    Map<Integer, String> labels = BayesUtils.readLabelIndex(configuration, new Path(labelIndexPath));
    Map<String, Integer> dictionary = readDictionnary(configuration, new Path(dictionaryPath));
    Map<Integer, Long> documentFrequency = readDocumentFrequency(configuration, new Path(documentFrequencyPath));

    // analyzer used to extract word from tweet
    Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_43);

    int labelCount = labels.size();
    int documentCount = documentFrequency.get(-1).intValue();
    System.out.println("Number of labels: " + labelCount);
    System.out.println("Number of documents in training set: " + documentCount);

    Connection c = null;
    Statement stmt = null;
    Statement stmtU = null;
    try {
        Class.forName("org.postgresql.Driver");
        c = DriverManager.getConnection("jdbc:postgresql://192.168.50.170:5432/uzeni", "postgres", "dbwpsdkdl");
        c.setAutoCommit(false);
        System.out.println("Opened database successfully");
        stmt = c.createStatement();
        stmtU = c.createStatement();
        ResultSet rs = stmt.executeQuery("SELECT * FROM " + tablename + " WHERE rep is null");

        while (rs.next()) {
            String seq = rs.getString("seq");
            //String rep = rs.getString("rep");
            String body = rs.getString("body");
            //String category = rep;
            String id = seq;
            String message = body;
            //System.out.println("Doc: " + id + "\t" + message);

            Multiset<String> words = ConcurrentHashMultiset.create();

            // extract words from tweet
            TokenStream ts = analyzer.tokenStream("text", new StringReader(message));
            CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
            ts.reset();
            int wordCount = 0;
            while (ts.incrementToken()) {
                if (termAtt.length() > 0) {
                    String word = ts.getAttribute(CharTermAttribute.class).toString();
                    Integer wordId = dictionary.get(word);
                    // if the word is not in the dictionary, skip it
                    if (wordId != null) {
                        words.add(word);
                        wordCount++;
                    }
                }
            }
            // Mark : Modified
            ts.end();
            ts.close();

            // create vector wordId => weight using tfidf
            Vector vector = new RandomAccessSparseVector(10000);
            TFIDF tfidf = new TFIDF();
            for (Multiset.Entry<String> entry : words.entrySet()) {
                String word = entry.getElement();
                int count = entry.getCount();
                Integer wordId = dictionary.get(word);
                Long freq = documentFrequency.get(wordId);
                double tfIdfValue = tfidf.calculate(count, freq.intValue(), wordCount, documentCount);
                vector.setQuick(wordId, tfIdfValue);
            }

            // With the classifier, we get one score for each label
            // The label with the highest score is the one the tweet is more likely to
            // be associated to
            Vector resultVector = classifier.classifyFull(vector);
            double bestScore = -Double.MAX_VALUE;
            int bestCategoryId = -1;
            for (Element element : resultVector.all()) {
                int categoryId = element.index();
                double score = element.get();
                if (score > bestScore) {
                    bestScore = score;
                    bestCategoryId = categoryId;
                }
                //System.out.print("  " + labels.get(categoryId) + ": " + score);
            }
            //System.out.println(" => " + labels.get(bestCategoryId));
            //System.out.println("UPDATE " + tablename + " SET rep = '" + labels.get(bestCategoryId) + "' WHERE seq = " + id);
            stmtU.executeUpdate("UPDATE " + tablename + " SET rep = '" + labels.get(bestCategoryId)
                    + "' WHERE seq = " + id);
        }
        rs.close();
        stmt.close();
        stmtU.close();
        c.commit();
        c.close();
        analyzer.close();
    } catch (Exception e) {
        System.err.println(e.getClass().getName() + ": " + e.getMessage());
        System.exit(0);
    }
}
From source file:analysis.FtpFilePathAnalyzer.java
License:Apache License
public static void main(String[] args) {
    Analyzer ana = new FtpFilePathAnalyzer();
    String test2 = "c++c++";
    StringReader reader = new StringReader(test2);
    TokenStream ts = ana.tokenStream("path", reader);
    try {
        while (ts.incrementToken()) {
            TermAttribute termAtt = (TermAttribute) ts.getAttribute(TermAttribute.class);
            OffsetAttribute offsetAtt = (OffsetAttribute) ts.getAttribute(OffsetAttribute.class);
            PositionIncrementAttribute posIncrAtt = (PositionIncrementAttribute) ts
                    .getAttribute(PositionIncrementAttribute.class);
            TypeAttribute typeAtt = (TypeAttribute) ts.getAttribute(TypeAttribute.class);
            System.out.print("(" + offsetAtt.startOffset() + "," + offsetAtt.endOffset() + ") ["
                    + posIncrAtt.getPositionIncrement() + "," + typeAtt.type() + "] "
                    + "[" + termAtt.term() + "]");
        }
    } catch (IOException e) {
        e.printStackTrace();
    }
}
From source file:at.ac.tuwien.ifs.myluceneanalyzers.fa.algorithm.PersianDictionaryCountCompoundWord.java
@SuppressWarnings({ "resource", "deprecation" })
private String stem(String input) throws IOException {
    String output = "";
    Reader reader = new StringReader(input);
    Tokenizer source = new StandardTokenizer(Version.LUCENE_4_10_3, reader);
    TokenStream tokenStream = new PersianStemFilter(source);
    CharTermAttribute charTermAttributeGreedy = tokenStream.getAttribute(CharTermAttribute.class);
    tokenStream.reset();
    while (tokenStream.incrementToken()) {
        output = output + " " + charTermAttributeGreedy.toString();
    }
    return output.trim();
}
From source file:at.newmedialab.lmf.util.solr.suggestion.service.FieldAnalyzerService.java
License:Apache License
/**
 * analyzes string like the default field
 * @param df the name of the default field
 * @param s the string to analyze
 * @return
 */
public static String analyzeString(SolrCore core, String df, String s) {
    try {
        TokenStream ts = core.getSchema().getFieldType(df).getQueryAnalyzer().tokenStream(df,
                new StringReader(s));
        StringBuffer b = new StringBuffer();
        ts.reset();
        while (ts.incrementToken()) {
            b.append(" ");
            CharTermAttribute attr = ts.getAttribute(CharTermAttribute.class);
            b.append(attr);
        }
        return b.toString().trim();
    } catch (IOException e) {
        e.printStackTrace();
        return s;
    }
}
From source file:at.tuwien.ifs.somtoolbox.apps.viewer.DocViewPanel.java
License:Apache License
private void updateWeightHighlighting() {
    // remove previous highlighting
    removeHighLights(weightingHighLights);
    if (weightHighlightBox.isSelected()) {
        if (inputDataObjects.getTemplateVector() == null) {
            Logger.getLogger("at.tuwien.ifs.somtoolbox").severe(
                    "Template vector file needed for displaying weights. Load from the File->Data files menu");
            weightHighlightBox.setSelected(false);
            return;
        }
        if (inputDataObjects.getInputData() == null) {
            Logger.getLogger("at.tuwien.ifs.somtoolbox").severe(
                    "Input data file needed for displaying weights. Load from the File->Data files menu");
            weightHighlightBox.setSelected(false);
            return;
        }
        SOMLibTemplateVector tv = inputDataObjects.getTemplateVector();
        InputData data = inputDataObjects.getInputData();
        InputDatum input = data.getInputDatum(currentInput);
        double maxValue = data.getMaxValue();
        double minValue = data.getMinValue();
        double span = maxValue - minValue;

        // init paints
        Palette p = paletteSelectionPanel.getSelectedPalette();
        int paletteLength = p.getNumberOfColours();
        weightPaints = new DefaultHighlighter.DefaultHighlightPainter[paletteLength];
        for (int i = 0; i < weightPaints.length; i++) {
            weightPaints[i] = new DefaultHighlighter.DefaultHighlightPainter(p.getColor(i));
        }

        String text = textPane.getText();
        StandardAnalyzer analyzer = new StandardAnalyzer(Version.LUCENE_30);
        TokenStream stream = analyzer.tokenStream("contents", new StringReader(text));
        try {
            while (stream.incrementToken()) {
                TypeAttribute typeAttribute = stream.getAttribute(TypeAttribute.class);
                if (!at.tuwien.ifs.somtoolbox.util.StringUtils.equalsAny(typeAttribute.type(), "<APOSTROPHE>")) {
                    TermAttribute termAttribute = stream.getAttribute(TermAttribute.class);
                    String term = termAttribute.term();
                    if (tv.containsLabel(term)) {
                        int index = tv.getIndex(term);
                        double value = input.getVector().getQuick(index);
                        int colorIndex = (int) (paletteLength / 4d
                                + relativeValue(minValue, span, value) * paletteLength / 2d);
                        OffsetAttribute offsetAttribute = stream.getAttribute(OffsetAttribute.class);
                        offsetAttribute.startOffset();
                        Object tag = highlighter.addHighlight(offsetAttribute.startOffset(),
                                offsetAttribute.endOffset(), weightPaints[colorIndex]);
                        weightingHighLights.add(tag);
                    }
                }
            }
        } catch (IOException e) {
            e.printStackTrace();
        } catch (BadLocationException e) {
            e.printStackTrace();
        }
    }
}
From source file:br.edu.utfpr.cm.JGitMinerWeb.services.matrix.auxiliary.LuceneUtil.java
public static List<String> tokenizeString(String linha) {
    Analyzer analyzer = new StopAnalyzer(Version.LUCENE_46);
    List<String> result = new ArrayList<>();
    try {
        TokenStream stream = analyzer.tokenStream(null, new StringReader(linha));
        stream.reset();
        while (stream.incrementToken()) {
            result.add(stream.getAttribute(CharTermAttribute.class).toString());
        }
    } catch (IOException e) {
        System.out.println(e.getMessage());
    }
    return result;
}
From source file:br.edu.utfpr.cm.JGitMinerWeb.util.LuceneUtil.java
public static List<String> tokenizeString(String linha) {
    Analyzer analyzer = new StopAnalyzer();
    List<String> result = new ArrayList<>();
    try {
        TokenStream stream = analyzer.tokenStream(null, new StringReader(linha));
        stream.reset();
        while (stream.incrementToken()) {
            result.add(stream.getAttribute(CharTermAttribute.class).toString());
        }
    } catch (IOException e) {
        System.out.println(e.getMessage());
    }
    return result;
}
From source file:br.ufmt.harmonizacao.implementer.PatenteeSearcher.java
public List<String> search(String field, String value) {
    try {
        long start = System.currentTimeMillis();
        TokenStream stream = analyzer.tokenStream(field, new StringReader(value));
        CharTermAttribute attr = stream.getAttribute(CharTermAttribute.class);
        stream.reset();
        String valor = "";
        while (stream.incrementToken()) {
            valor = valor + attr.toString() + ' ';
        }

        BooleanQuery bq = new BooleanQuery();
        BooleanQuery acronymBq = null;
        String query = "";
        BooleanQuery wrapBq = new BooleanQuery();
        String[] tokens = valor.split(" ");
        for (int i = 0; i < tokens.length; i++) {
            if (tokens.length >= 2) {
                acronymBq = new BooleanQuery();
                switch (i) {
                case 0:
                    acronymBq.add(new PrefixQuery(new Term(field, tokens[i])), BooleanClause.Occur.MUST);
                    bq.add(new PrefixQuery(new Term(field, tokens[i])), BooleanClause.Occur.SHOULD);
                    break;
                case 1:
                    acronymBq.add(new FuzzyQuery(new Term(field, tokens[i])), BooleanClause.Occur.MUST_NOT);
                    bq.add(new FuzzyQuery(new Term(field, tokens[i])), BooleanClause.Occur.SHOULD);
                    bq.add(new LengthQuery(field, valor), BooleanClause.Occur.MUST_NOT);
                    break;
                default:
                    break;
                }
            } else {
                if (tokens[i].length() > 3) {
                    bq.add(new FuzzyQuery(new Term(field, tokens[i])), BooleanClause.Occur.MUST);
                } else {
                    bq.add(new TermQuery(new Term(field, tokens[i])), BooleanClause.Occur.MUST);
                }
            }
        }
        stream.end();
        stream.close();
        // Tokenization ends here.

        // Build the wrapping query; the fuzzy query performs the approximate matching.
        wrapBq.add(bq, BooleanClause.Occur.MUST);
        if (acronymBq != null) {
            //new QueryParser(Version.LUCENE_47, field, new StandardAnalyzer(Version.LUCENE_47)).parse(query)
            wrapBq.add(acronymBq, BooleanClause.Occur.MUST_NOT);
        }
        String queryTime = "Tempo para construção da query : " + (System.currentTimeMillis() - start) + "ms";

        // Collect the documents found by the search.
        start = System.currentTimeMillis();
        ScoreDoc[] hits = searcher.search(wrapBq, 10).scoreDocs;
        String searchTime = "Tempo para busca : " + (System.currentTimeMillis() - start) + "ms";

        List<String> result = new ArrayList<String>();
        result.add(valor);
        if (hits.length > 0) {
            for (int i = 0; i < hits.length; i++) {
                Document hitDoc = searcher.doc(hits[i].doc);
                result.add(hitDoc.get(field));
            }
        }
        result.add(queryTime);
        result.add(searchTime);
        return result;
    } catch (IOException ex) {
        Logger.getLogger(PatenteeSearcher.class.getName()).log(Level.SEVERE, null, ex);
    }
    return null;
}
From source file:byrne.mitre.MitreQuery.java
License:Apache License
public void run() {
    try {
        TokenStream tokenStream = analyzer.tokenStream("ngrams", new StringReader(entry.getFullName()));
        BooleanQuery bq = new BooleanQuery();
        while (tokenStream.incrementToken()) {
            Term t = new Term("ngrams", tokenStream.getAttribute(TermAttribute.class).term());
            bq.add(new TermQuery(t), BooleanClause.Occur.SHOULD);
        }
        TopScoreDocCollector collector = TopScoreDocCollector.create(hitsPerPage, true);
        searcher.search(bq, collector);
        ScoreDoc[] hits = collector.topDocs().scoreDocs;

        for (int i = 0; i < hits.length; ++i) {
            int docId = hits[i].doc;
            Document d = searcher.doc(docId);
            out.write(entry.getID() + "|" + d.get("id") + "|" + df.format(hits[i].score) + "\n");
        }
    } catch (IOException IOE) {
    }
}