List of usage examples for org.apache.lucene.analysis.TokenStream#end()
public void end() throws IOException
Called by the consumer after the last token has been consumed, i.e. after incrementToken() returned false (using the new TokenStream API).
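Before the per-project examples, here is a minimal sketch of the consume cycle every caller below follows: reset(), then incrementToken() until it returns false, then end(), then close(). This sketch is not taken from any of the listed sources; it assumes a Lucene release where StandardAnalyzer has a no-argument constructor (older releases, such as the LUCENE_43 code below, pass a Version), and the field name "body" and the sample text are arbitrary placeholders.

    import java.io.IOException;
    import java.io.StringReader;

    import org.apache.lucene.analysis.Analyzer;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.standard.StandardAnalyzer;
    import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

    public class TokenStreamEndSketch {
        public static void main(String[] args) throws IOException {
            try (Analyzer analyzer = new StandardAnalyzer()) {
                TokenStream ts = analyzer.tokenStream("body", new StringReader("a small sample text"));
                CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
                ts.reset();                    // mandatory before the first incrementToken()
                while (ts.incrementToken()) {  // returns false once the stream is exhausted
                    System.out.println(termAtt.toString());
                }
                ts.end();    // record end-of-stream state (e.g. the final offset)
                ts.close();  // release underlying resources
            }
        }
    }

Since TokenStream implements Closeable, the stream itself can also be placed in a try-with-resources block so close() is guaranteed even if tokenization throws.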
From source file:ClassifierHD.java
License:Apache License
public static void main(String[] args) throws Exception {
    // indices 0..5 are read below, so at least 6 arguments are required
    if (args.length < 6) {
        System.out.println(
                "Arguments: [model] [label index] [dictionary] [document frequency] [postgres table] [hdfs dir] [job_id]");
        return;
    }
    String modelPath = args[0];
    String labelIndexPath = args[1];
    String dictionaryPath = args[2];
    String documentFrequencyPath = args[3];
    String tablename = args[4];
    String inputDir = args[5];

    Configuration configuration = new Configuration();

    // model is a matrix (wordId, labelId) => probability score
    NaiveBayesModel model = NaiveBayesModel.materialize(new Path(modelPath), configuration);
    StandardNaiveBayesClassifier classifier = new StandardNaiveBayesClassifier(model);

    // labels is a map label => classId
    Map<Integer, String> labels = BayesUtils.readLabelIndex(configuration, new Path(labelIndexPath));
    Map<String, Integer> dictionary = readDictionnary(configuration, new Path(dictionaryPath));
    Map<Integer, Long> documentFrequency = readDocumentFrequency(configuration,
            new Path(documentFrequencyPath));

    // analyzer used to extract words from tweets
    Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_43);

    int labelCount = labels.size();
    int documentCount = documentFrequency.get(-1).intValue();

    System.out.println("Number of labels: " + labelCount);
    System.out.println("Number of documents in training set: " + documentCount);

    Connection conn = null;
    PreparedStatement pstmt = null;

    try {
        Class.forName("org.postgresql.Driver");
        conn = DriverManager.getConnection("jdbc:postgresql://192.168.50.170:5432/uzeni", "postgres",
                "dbwpsdkdl");
        conn.setAutoCommit(false);
        String sql = "INSERT INTO " + tablename
                + " (id,gtime,wtime,target,num,link,body,rep) VALUES (?,?,?,?,?,?,?,?);";
        pstmt = conn.prepareStatement(sql);

        FileSystem fs = FileSystem.get(configuration);
        FileStatus[] status = fs.listStatus(new Path(inputDir));
        BufferedWriter bw = new BufferedWriter(
                new OutputStreamWriter(fs.create(new Path(inputDir + "/rep.list"), true)));

        for (int i = 0; i < status.length; i++) {
            BufferedReader br = new BufferedReader(new InputStreamReader(fs.open(status[i].getPath())));
            if (new String(status[i].getPath().getName()).equals("rep.list")) {
                continue;
            }
            int lv_HEAD = 1;
            int lv_cnt = 0;
            String lv_gtime = null;
            String lv_wtime = null;
            String lv_target = null;
            BigDecimal lv_num = null;
            String lv_link = null;
            String[] lv_args;
            String lv_line;
            StringBuilder lv_txt = new StringBuilder();
            while ((lv_line = br.readLine()) != null) {
                if (lv_cnt < lv_HEAD) {
                    lv_args = lv_line.split(",");
                    lv_gtime = lv_args[0];
                    lv_wtime = lv_args[1];
                    lv_target = lv_args[2];
                    lv_num = new BigDecimal(lv_args[3]);
                    lv_link = lv_args[4];
                } else {
                    lv_txt.append(lv_line + '\n');
                }
                lv_cnt++;
            }
            br.close();

            String id = status[i].getPath().getName();
            String message = lv_txt.toString();

            Multiset<String> words = ConcurrentHashMultiset.create();

            // extract words from the message
            TokenStream ts = analyzer.tokenStream("text", new StringReader(message));
            CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
            ts.reset();
            int wordCount = 0;
            while (ts.incrementToken()) {
                if (termAtt.length() > 0) {
                    String word = ts.getAttribute(CharTermAttribute.class).toString();
                    Integer wordId = dictionary.get(word);
                    if (wordId != null) {
                        words.add(word);
                        wordCount++;
                    }
                }
            }
            ts.end();
            ts.close();

            // create vector wordId => weight using tfidf
            Vector vector = new RandomAccessSparseVector(10000);
            TFIDF tfidf = new TFIDF();
            for (Multiset.Entry<String> entry : words.entrySet()) {
                String word = entry.getElement();
                int count = entry.getCount();
                Integer wordId = dictionary.get(word);
                Long freq = documentFrequency.get(wordId);
                double tfIdfValue = tfidf.calculate(count, freq.intValue(), wordCount, documentCount);
                vector.setQuick(wordId, tfIdfValue);
            }

            Vector resultVector = classifier.classifyFull(vector);
            double bestScore = -Double.MAX_VALUE;
            int bestCategoryId = -1;
            for (Element element : resultVector.all()) {
                int categoryId = element.index();
                double score = element.get();
                if (score > bestScore) {
                    bestScore = score;
                    bestCategoryId = categoryId;
                }
            }
            //System.out.println(message);
            //System.out.println(" => " + lv_gtime + lv_wtime + lv_link + id + ":" + labels.get(bestCategoryId));

            pstmt.setString(1, id);
            pstmt.setString(2, lv_gtime);
            pstmt.setString(3, lv_wtime);
            pstmt.setString(4, lv_target);
            pstmt.setBigDecimal(5, lv_num);
            pstmt.setString(6, lv_link);
            pstmt.setString(7, message.substring(1, Math.min(50, message.length())));
            pstmt.setString(8, labels.get(bestCategoryId));
            pstmt.addBatch();
            bw.write(id + "\t" + labels.get(bestCategoryId) + "\n");
        }
        pstmt.executeBatch();
        //pstmt.clearParameters();
        pstmt.close();
        conn.commit();
        conn.close();
        bw.close();
    } catch (Exception e) {
        System.err.println(e.getClass().getName() + ": " + e.getMessage());
        System.exit(0);
    }
    analyzer.close();
}
From source file:PostgresClassifier.java
License:Apache License
public static void main(String[] args) throws Exception {
    if (args.length < 5) {
        System.out.println(
                "Arguments: [model] [label index] [dictionary] [document frequency] [input postgres table]");
        return;
    }
    String modelPath = args[0];
    String labelIndexPath = args[1];
    String dictionaryPath = args[2];
    String documentFrequencyPath = args[3];
    String tablename = args[4];

    Configuration configuration = new Configuration();

    // model is a matrix (wordId, labelId) => probability score
    NaiveBayesModel model = NaiveBayesModel.materialize(new Path(modelPath), configuration);
    StandardNaiveBayesClassifier classifier = new StandardNaiveBayesClassifier(model);

    // labels is a map label => classId
    Map<Integer, String> labels = BayesUtils.readLabelIndex(configuration, new Path(labelIndexPath));
    Map<String, Integer> dictionary = readDictionnary(configuration, new Path(dictionaryPath));
    Map<Integer, Long> documentFrequency = readDocumentFrequency(configuration,
            new Path(documentFrequencyPath));

    // analyzer used to extract words from tweets
    Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_43);

    int labelCount = labels.size();
    int documentCount = documentFrequency.get(-1).intValue();

    System.out.println("Number of labels: " + labelCount);
    System.out.println("Number of documents in training set: " + documentCount);

    Connection c = null;
    Statement stmt = null;
    Statement stmtU = null;
    try {
        Class.forName("org.postgresql.Driver");
        c = DriverManager.getConnection("jdbc:postgresql://192.168.50.170:5432/uzeni", "postgres",
                "dbwpsdkdl");
        c.setAutoCommit(false);
        System.out.println("Opened database successfully");

        stmt = c.createStatement();
        stmtU = c.createStatement();
        ResultSet rs = stmt.executeQuery("SELECT * FROM " + tablename + " WHERE rep is null");

        while (rs.next()) {
            String seq = rs.getString("seq");
            //String rep = rs.getString("rep");
            String body = rs.getString("body");
            //String category = rep;
            String id = seq;
            String message = body;
            //System.out.println("Doc: " + id + "\t" + message);

            Multiset<String> words = ConcurrentHashMultiset.create();

            // extract words from tweet
            TokenStream ts = analyzer.tokenStream("text", new StringReader(message));
            CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
            ts.reset();
            int wordCount = 0;
            while (ts.incrementToken()) {
                if (termAtt.length() > 0) {
                    String word = ts.getAttribute(CharTermAttribute.class).toString();
                    Integer wordId = dictionary.get(word);
                    // if the word is not in the dictionary, skip it
                    if (wordId != null) {
                        words.add(word);
                        wordCount++;
                    }
                }
            }
            // Mark : Modified
            ts.end();
            ts.close();

            // create vector wordId => weight using tfidf
            Vector vector = new RandomAccessSparseVector(10000);
            TFIDF tfidf = new TFIDF();
            for (Multiset.Entry<String> entry : words.entrySet()) {
                String word = entry.getElement();
                int count = entry.getCount();
                Integer wordId = dictionary.get(word);
                Long freq = documentFrequency.get(wordId);
                double tfIdfValue = tfidf.calculate(count, freq.intValue(), wordCount, documentCount);
                vector.setQuick(wordId, tfIdfValue);
            }

            // With the classifier, we get one score for each label.
            // The label with the highest score is the one the tweet is most likely
            // to be associated with.
            Vector resultVector = classifier.classifyFull(vector);
            double bestScore = -Double.MAX_VALUE;
            int bestCategoryId = -1;
            for (Element element : resultVector.all()) {
                int categoryId = element.index();
                double score = element.get();
                if (score > bestScore) {
                    bestScore = score;
                    bestCategoryId = categoryId;
                }
                //System.out.print(" " + labels.get(categoryId) + ": " + score);
            }
            //System.out.println(" => " + labels.get(bestCategoryId));

            //System.out.println("UPDATE " + tablename + " SET rep = '" + labels.get(bestCategoryId) + "' WHERE seq = " + id);
            stmtU.executeUpdate("UPDATE " + tablename + " SET rep = '" + labels.get(bestCategoryId)
                    + "' WHERE seq = " + id);
        }
        rs.close();
        stmt.close();
        stmtU.close();
        c.commit();
        c.close();
        analyzer.close();
    } catch (Exception e) {
        System.err.println(e.getClass().getName() + ": " + e.getMessage());
        System.exit(0);
    }
}
From source file:SimpleNaiveBayesDocumentClassifier.java
License:Apache License
/**
 * Returns a token array from the {@link org.apache.lucene.analysis.TokenStream} in input
 *
 * @param tokenizedText the tokenized content of a field
 * @return a {@code String} array of the resulting tokens
 * @throws java.io.IOException If tokenization fails because there is a low-level I/O error
 */
protected String[] getTokenArray(TokenStream tokenizedText) throws IOException {
    Collection<String> tokens = new LinkedList<>();
    CharTermAttribute charTermAttribute = tokenizedText.addAttribute(CharTermAttribute.class);
    tokenizedText.reset();
    while (tokenizedText.incrementToken()) {
        tokens.add(charTermAttribute.toString());
    }
    tokenizedText.end();
    tokenizedText.close();
    return tokens.toArray(new String[tokens.size()]);
}
From source file:at.ac.univie.mminf.luceneSKOS.analysis.AbstractMeSHFilter.java
License:Apache License
public static CharsRef analyze(Analyzer analyzer, String text, CharsRef reuse) throws IOException {
    TokenStream ts = analyzer.tokenStream("", new StringReader(text));
    CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
    // PositionIncrementAttribute posIncAtt =
    //     ts.addAttribute(PositionIncrementAttribute.class);
    ts.reset();
    reuse.length = 0;
    while (ts.incrementToken()) {
        int length = termAtt.length();
        if (length == 0) {
            throw new IllegalArgumentException("term: " + text + " analyzed to a zero-length token");
        }
        // if (posIncAtt.getPositionIncrement() != 1) {
        //     throw new IllegalArgumentException("term: " + text + " analyzed to a token with posinc != 1");
        // }
        reuse.grow(reuse.length + length + 1); /* current + word + separator */
        int end = reuse.offset + reuse.length;
        if (reuse.length > 0) {
            reuse.chars[end++] = 32; // space
            reuse.length++;
        }
        System.arraycopy(termAtt.buffer(), 0, reuse.chars, end, length);
        reuse.length += length;
    }
    ts.end();
    ts.close();
    if (reuse.length == 0) {
        throw new IllegalArgumentException("term: " + text + " was completely eliminated by analyzer");
    }
    return reuse;
}
From source file:at.ac.univie.mminf.luceneSKOS.analysis.SNOMEDFilter.java
License:Apache License
public static CharsRef analyze(Analyzer analyzer, String text, CharsRef reuse) throws IOException {
    TokenStream ts = analyzer.tokenStream("", new StringReader(text));
    CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
    // PositionIncrementAttribute posIncAtt =
    //     ts.addAttribute(PositionIncrementAttribute.class);
    boolean phraseTerm = false;
    ts.reset();
    reuse.length = 0;
    while (ts.incrementToken()) {
        // System.out.println(text + " | " + termAtt.toString());
        int length = termAtt.length();
        if (length == 0) {
            throw new IllegalArgumentException("term: " + text + " analyzed to a zero-length token");
        }
        // if (posIncAtt.getPositionIncrement() != 1) {
        //     throw new IllegalArgumentException("term: " + text + " analyzed to a token with posinc != 1");
        // }
        reuse.grow(reuse.length + length + 1); /* current + word + separator */
        int end = reuse.offset + reuse.length;
        if (reuse.length > 0) {
            reuse.chars[end++] = 32; // space
            reuse.length++;
            phraseTerm = true;
        }
        System.arraycopy(termAtt.buffer(), 0, reuse.chars, end, length);
        reuse.length += length;
    }
    ts.end();
    ts.close();
    if (reuse.length == 0) {
        throw new IllegalArgumentException("term: " + text + " was completely eliminated by analyzer");
    }
    if (phraseTerm) {
        reuse.grow(reuse.length + 2); /* current + word + separator */
        reuse.length += 2;
        char next = reuse.chars[0];
        for (int i = 0; i < reuse.length - 2; i++) {
            char tmp = reuse.chars[i + 1];
            reuse.chars[i + 1] = next;
            next = tmp;
        }
        reuse.chars[0] = '\"';
        reuse.chars[reuse.length - 1] = '\"';
    }
    return reuse;
}
From source file:at.itbh.bev.apibeans.FinderImpl.java
License:Open Source License
public FullTextQuery constructQuery(EntityManager em, String postalCode, String place, String addressLine,
        String houseId) throws InvalidApiUsageException {
    FullTextEntityManager fullTextEm = Search.getFullTextEntityManager(em);

    if ((Objects.toString(postalCode, "") + Objects.toString(place, "") + Objects.toString(addressLine, "")
            + Objects.toString(houseId, "")).length() == 0) {
        throw new InvalidApiUsageException(
                "At least one parameter must be provided. Coordinates don't count as parameters.");
    }
    if (addressLine != null && addressLine.length() < 2 && addressLine.length() > 0) {
        throw new InvalidApiUsageException("The parameter addressLine must consist of at least 2 characters.");
    }

    QueryBuilder b = fullTextEm.getSearchFactory().buildQueryBuilder().forEntity(AdresseDenormalized.class)
            .get();
    List<Query> queries = new ArrayList<>();

    if (postalCode != null && postalCode.length() > 0) {
        queries.add(b.keyword().onField("postalCode").boostedTo(20).matching(postalCode).createQuery());
    }

    if (addressLine != null && addressLine.length() > 0) {
        queries.add(b.keyword().onField("addressLine").matching(addressLine + addressLine + addressLine)
                .createQuery());
        // triple addressLine since in the data source it is also tripled if
        // there is no building or address name
        queries.add(b.keyword().onField("addressLineExact").boostedTo(10)
                .matching(addressLine + addressLine + addressLine).createQuery());
    }

    if (houseId != null && houseId.length() > 0) {
        // if the search string contains a number, take the first number in the
        // search string and match it against the house number
        Matcher matcher = housenumberPattern.matcher(houseId);
        if (matcher.find()) {
            queries.add(
                    b.keyword().onField("hausnrzahl").boostedTo(50).matching(matcher.group(1)).createQuery());
        }
        if (houseId.matches(".*\\D.*")) {
            queries.add(b.keyword().onField("houseIdExact").matching(houseId).createQuery());
        }
        queries.add(b.keyword().onField("houseId").boostedTo(20).matching(houseId).createQuery());

        TextAnalyzer analyzer = new TextAnalyzer();
        TokenStream stream;
        try {
            stream = analyzer.tokenStream(null, new StringReader(houseId));
            // CharTermAttribute cattr = stream.addAttribute(CharTermAttribute.class);
            stream.reset();
            if (stream.incrementToken()) {
                // if the analyzer does not remove everything, also check hofname and hausnrgebaeudebez
                queries.add(b.keyword().onField("hofname").matching(houseId).createQuery());
                queries.add(b.keyword().onField("hausnrgebaeudebez").matching(houseId).createQuery());
                // System.out.println(cattr.toString());
            }
            stream.end();
            stream.close();
        } catch (IOException e1) {
            e1.printStackTrace();
        }
        analyzer.close();
    }

    if (place != null && place.length() > 0) {
        queries.add(b.keyword().onField("place").matching(place).createQuery());
        queries.add(b.keyword().onField("municipalityExact").boostedTo(20).matching(place).createQuery());
        queries.add(b.keyword().onField("placeExact").boostedTo(5).matching(place).createQuery());
    }

    @SuppressWarnings("rawtypes")
    BooleanJunction bq = b.bool();
    for (Query item : queries) {
        bq = bq.should(item);
    }

    FullTextQuery fullTextQuery = fullTextEm.createFullTextQuery(bq.createQuery(), AdresseDenormalized.class);
    return fullTextQuery;
}
From source file:bixo.examples.webmining.PhraseShingleAnalyzer.java
License:Apache License
public List<String> getTermList(String contentText) {
    List<String> result = new ArrayList<String>(contentText.length() / 10);

    try {
        TokenStream stream = _analyzer.tokenStream("content", new StringReader(contentText));
        CharTermAttribute termAtt = (CharTermAttribute) stream.addAttribute(CharTermAttribute.class);
        stream.reset();
        while (stream.incrementToken()) {
            if (termAtt.length() > 0) {
                String term = termAtt.toString();
                result.add(term);
            }
        }
        stream.end();
        stream.close();
    } catch (IOException e) {
        throw new RuntimeException("Impossible error", e);
    }

    return result;
}
From source file:br.ufmt.harmonizacao.implementer.PatenteeSearcher.java
public List<String> search(String field, String value) {
    try {
        long start = System.currentTimeMillis();
        TokenStream stream = analyzer.tokenStream(field, new StringReader(value));
        CharTermAttribute attr = stream.getAttribute(CharTermAttribute.class);
        stream.reset();
        String valor = "";
        while (stream.incrementToken()) {
            valor = valor + attr.toString() + ' ';
        }

        BooleanQuery bq = new BooleanQuery();
        BooleanQuery acronymBq = null;
        String query = "";
        BooleanQuery wrapBq = new BooleanQuery();

        String[] tokens = valor.split(" ");
        for (int i = 0; i < tokens.length; i++) {
            if (tokens.length >= 2) {
                acronymBq = new BooleanQuery();
                switch (i) {
                case 0:
                    acronymBq.add(new PrefixQuery(new Term(field, tokens[i])), BooleanClause.Occur.MUST);
                    bq.add(new PrefixQuery(new Term(field, tokens[i])), BooleanClause.Occur.SHOULD);
                    break;
                case 1:
                    acronymBq.add(new FuzzyQuery(new Term(field, tokens[i])), BooleanClause.Occur.MUST_NOT);
                    bq.add(new FuzzyQuery(new Term(field, tokens[i])), BooleanClause.Occur.SHOULD);
                    bq.add(new LengthQuery(field, valor), BooleanClause.Occur.MUST_NOT);
                    break;
                default:
                    break;
                }
            } else {
                if (tokens[i].length() > 3) {
                    bq.add(new FuzzyQuery(new Term(field, tokens[i])), BooleanClause.Occur.MUST);
                } else {
                    bq.add(new TermQuery(new Term(field, tokens[i])), BooleanClause.Occur.MUST);
                }
            }
        }
        stream.end();
        stream.close();
        // Tokenization ends here.

        // Build the wrapping query; the fuzzy query performs the approximate matching.
        wrapBq.add(bq, BooleanClause.Occur.MUST);
        if (acronymBq != null) {
            //new QueryParser(Version.LUCENE_47, field, new StandardAnalyzer(Version.LUCENE_47)).parse(query)
            wrapBq.add(acronymBq, BooleanClause.Occur.MUST_NOT);
        }
        String queryTime = "Time to build the query: " + (System.currentTimeMillis() - start) + "ms";

        // Fetch the documents found by the search
        start = System.currentTimeMillis();
        ScoreDoc[] hits = searcher.search(wrapBq, 10).scoreDocs;
        String searchTime = "Time to search: " + (System.currentTimeMillis() - start) + "ms";

        List<String> result = new ArrayList<String>();
        result.add(valor);
        if (hits.length > 0) {
            for (int i = 0; i < hits.length; i++) {
                Document hitDoc = searcher.doc(hits[i].doc);
                result.add(hitDoc.get(field));
            }
        }
        result.add(queryTime);
        result.add(searchTime);
        return result;
    } catch (IOException ex) {
        Logger.getLogger(PatenteeSearcher.class.getName()).log(Level.SEVERE, null, ex);
    }
    return null;
}
From source file:cn.edu.thss.iise.beehivez.server.index.labelindex.LabelLuceneIndex.java
License:Open Source License
public boolean contain(String label) {
    try {
        IndexReader reader = IndexReader.open(this.indexDir, true);
        Searcher searcher = new IndexSearcher(reader);

        // use the boolean query
        HashSet<String> queryTermSet = new HashSet<String>();
        TokenStream stream = analyzer.tokenStream(LabelDocument.FIELD_LABEL, new StringReader(label));
        TermAttribute termAtt = stream.addAttribute(TermAttribute.class);
        stream.reset();
        while (stream.incrementToken()) {
            queryTermSet.add(termAtt.term());
        }
        stream.end();
        stream.close();

        // construct the query
        BooleanQuery bq = new BooleanQuery();
        Iterator<String> it = queryTermSet.iterator();
        while (it.hasNext()) {
            String s = it.next();
            Term term = new Term(LabelDocument.FIELD_LABEL, s);
            TermQuery termQuery = new TermQuery(term);
            bq.add(termQuery, Occur.MUST);
        }

        ExactLabelQueryResultCollector collector = new ExactLabelQueryResultCollector(reader, label);
        searcher.search(bq, collector);
        boolean ret = collector.isExistQueryLabel();
        reader.close();
        return ret;
    } catch (Exception e) {
        e.printStackTrace();
    }
    return false;
}
From source file:cn.edu.thss.iise.beehivez.server.index.labelindex.LabelLuceneIndex.java
License:Open Source License
public TreeSet<SimilarLabelQueryResult> getSimilarLabels(String query, float similarity) {
    TreeSet<SimilarLabelQueryResult> ret = new TreeSet<SimilarLabelQueryResult>();
    if (query == null) {
        ret.add(new SimilarLabelQueryResult(null, 1));
        return ret;
    }
    try {
        IndexReader reader = IndexReader.open(this.indexDir, true);
        Searcher searcher = new IndexSearcher(reader);

        // get terms from query
        HashSet<String> queryTermSet = new HashSet<String>();
        TokenStream stream = analyzer.tokenStream(LabelDocument.FIELD_LABEL, new StringReader(query));
        TermAttribute termAtt = stream.addAttribute(TermAttribute.class);
        stream.reset();
        while (stream.incrementToken()) {
            queryTermSet.add(termAtt.term());
        }
        stream.end();
        stream.close();

        // construct the query
        BooleanQuery bq = new BooleanQuery();
        Iterator<String> it = queryTermSet.iterator();
        SynonymMap synMap = SynonymIndex.getSynonymMap();
        HashSet<String> expandedQueryTermSet = new HashSet<String>(queryTermSet);
        while (it.hasNext()) {
            String s = it.next();
            Term term = new Term(LabelDocument.FIELD_LABEL, s);
            TermQuery termQuery = new TermQuery(term);
            bq.add(termQuery, Occur.SHOULD);
            // expand using synonyms
            for (String syn : synMap.getSynonyms(s)) {
                stemer.setCurrent(syn);
                stemer.stem();
                syn = stemer.getCurrent();
                if (expandedQueryTermSet.add(syn)) {
                    term = new Term(LabelDocument.FIELD_LABEL, syn);
                    termQuery = new TermQuery(term);
                    bq.add(termQuery, Occur.SHOULD);
                }
            }
        }

        // search in the label index
        SimilarLabelQueryResultCollector collector = new SimilarLabelQueryResultCollector(reader,
                queryTermSet, similarity);
        searcher.search(bq, collector);
        ret = collector.getQueryResult();
        searcher.close();
        reader.close();
    } catch (Exception e) {
        e.printStackTrace();
    }
    return ret;
}