List of usage examples for the close() method of org.apache.lucene.analysis.Analyzer
@Override public void close()
From source file:ClassifierHD.java
License:Apache License
public static void main(String[] args) throws Exception { if (args.length < 5) { System.out.println(/*from ww w .ja v a 2 s . c o m*/ "Arguments: [model] [label index] [dictionnary] [document frequency] [postgres table] [hdfs dir] [job_id]"); return; } String modelPath = args[0]; String labelIndexPath = args[1]; String dictionaryPath = args[2]; String documentFrequencyPath = args[3]; String tablename = args[4]; String inputDir = args[5]; Configuration configuration = new Configuration(); // model is a matrix (wordId, labelId) => probability score NaiveBayesModel model = NaiveBayesModel.materialize(new Path(modelPath), configuration); StandardNaiveBayesClassifier classifier = new StandardNaiveBayesClassifier(model); // labels is a map label => classId Map<Integer, String> labels = BayesUtils.readLabelIndex(configuration, new Path(labelIndexPath)); Map<String, Integer> dictionary = readDictionnary(configuration, new Path(dictionaryPath)); Map<Integer, Long> documentFrequency = readDocumentFrequency(configuration, new Path(documentFrequencyPath)); // analyzer used to extract word from tweet Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_43); int labelCount = labels.size(); int documentCount = documentFrequency.get(-1).intValue(); System.out.println("Number of labels: " + labelCount); System.out.println("Number of documents in training set: " + documentCount); Connection conn = null; PreparedStatement pstmt = null; try { Class.forName("org.postgresql.Driver"); conn = DriverManager.getConnection("jdbc:postgresql://192.168.50.170:5432/uzeni", "postgres", "dbwpsdkdl"); conn.setAutoCommit(false); String sql = "INSERT INTO " + tablename + " (id,gtime,wtime,target,num,link,body,rep) VALUES (?,?,?,?,?,?,?,?);"; pstmt = conn.prepareStatement(sql); FileSystem fs = FileSystem.get(configuration); FileStatus[] status = fs.listStatus(new Path(inputDir)); BufferedWriter bw = new BufferedWriter( new OutputStreamWriter(fs.create(new Path(inputDir + "/rep.list"), true))); for 
(int i = 0; i < status.length; i++) { BufferedReader br = new BufferedReader(new InputStreamReader(fs.open(status[i].getPath()))); if (new String(status[i].getPath().getName()).equals("rep.list")) { continue; } int lv_HEAD = 1; int lv_cnt = 0; String lv_gtime = null; String lv_wtime = null; String lv_target = null; BigDecimal lv_num = null; String lv_link = null; String[] lv_args; String lv_line; StringBuilder lv_txt = new StringBuilder(); while ((lv_line = br.readLine()) != null) { if (lv_cnt < lv_HEAD) { lv_args = lv_line.split(","); lv_gtime = lv_args[0]; lv_wtime = lv_args[1]; lv_target = lv_args[2]; lv_num = new BigDecimal(lv_args[3]); lv_link = lv_args[4]; } else { lv_txt.append(lv_line + '\n'); } lv_cnt++; } br.close(); String id = status[i].getPath().getName(); String message = lv_txt.toString(); Multiset<String> words = ConcurrentHashMultiset.create(); TokenStream ts = analyzer.tokenStream("text", new StringReader(message)); CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class); ts.reset(); int wordCount = 0; while (ts.incrementToken()) { if (termAtt.length() > 0) { String word = ts.getAttribute(CharTermAttribute.class).toString(); Integer wordId = dictionary.get(word); if (wordId != null) { words.add(word); wordCount++; } } } ts.end(); ts.close(); Vector vector = new RandomAccessSparseVector(10000); TFIDF tfidf = new TFIDF(); for (Multiset.Entry<String> entry : words.entrySet()) { String word = entry.getElement(); int count = entry.getCount(); Integer wordId = dictionary.get(word); Long freq = documentFrequency.get(wordId); double tfIdfValue = tfidf.calculate(count, freq.intValue(), wordCount, documentCount); vector.setQuick(wordId, tfIdfValue); } Vector resultVector = classifier.classifyFull(vector); double bestScore = -Double.MAX_VALUE; int bestCategoryId = -1; for (Element element : resultVector.all()) { int categoryId = element.index(); double score = element.get(); if (score > bestScore) { bestScore = score; bestCategoryId = 
categoryId; } } //System.out.println(message); //System.out.println(" => "+ lv_gtime + lv_wtime + lv_link + id + ":" + labels.get(bestCategoryId)); pstmt.setString(1, id); pstmt.setString(2, lv_gtime); pstmt.setString(3, lv_wtime); pstmt.setString(4, lv_target); pstmt.setBigDecimal(5, lv_num); pstmt.setString(6, lv_link); pstmt.setString(7, message.substring(1, Math.min(50, message.length()))); pstmt.setString(8, labels.get(bestCategoryId)); pstmt.addBatch(); bw.write(id + "\t" + labels.get(bestCategoryId) + "\n"); } pstmt.executeBatch(); //pstmt.clearParameters(); pstmt.close(); conn.commit(); conn.close(); bw.close(); } catch (Exception e) { System.err.println(e.getClass().getName() + ": " + e.getMessage()); System.exit(0); } analyzer.close(); }
From source file:PostgresClassifier.java
License:Apache License
public static void main(String[] args) throws Exception { if (args.length < 5) { System.out.println(/*from ww w .j av a 2s.c o m*/ "Arguments: [model] [label index] [dictionnary] [document frequency] [input postgres table]"); return; } String modelPath = args[0]; String labelIndexPath = args[1]; String dictionaryPath = args[2]; String documentFrequencyPath = args[3]; String tablename = args[4]; Configuration configuration = new Configuration(); // model is a matrix (wordId, labelId) => probability score NaiveBayesModel model = NaiveBayesModel.materialize(new Path(modelPath), configuration); StandardNaiveBayesClassifier classifier = new StandardNaiveBayesClassifier(model); // labels is a map label => classId Map<Integer, String> labels = BayesUtils.readLabelIndex(configuration, new Path(labelIndexPath)); Map<String, Integer> dictionary = readDictionnary(configuration, new Path(dictionaryPath)); Map<Integer, Long> documentFrequency = readDocumentFrequency(configuration, new Path(documentFrequencyPath)); // analyzer used to extract word from tweet Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_43); int labelCount = labels.size(); int documentCount = documentFrequency.get(-1).intValue(); System.out.println("Number of labels: " + labelCount); System.out.println("Number of documents in training set: " + documentCount); Connection c = null; Statement stmt = null; Statement stmtU = null; try { Class.forName("org.postgresql.Driver"); c = DriverManager.getConnection("jdbc:postgresql://192.168.50.170:5432/uzeni", "postgres", "dbwpsdkdl"); c.setAutoCommit(false); System.out.println("Opened database successfully"); stmt = c.createStatement(); stmtU = c.createStatement(); ResultSet rs = stmt.executeQuery("SELECT * FROM " + tablename + " WHERE rep is null"); while (rs.next()) { String seq = rs.getString("seq"); //String rep = rs.getString("rep"); String body = rs.getString("body"); //String category = rep; String id = seq; String message = body; 
//System.out.println("Doc: " + id + "\t" + message); Multiset<String> words = ConcurrentHashMultiset.create(); // extract words from tweet TokenStream ts = analyzer.tokenStream("text", new StringReader(message)); CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class); ts.reset(); int wordCount = 0; while (ts.incrementToken()) { if (termAtt.length() > 0) { String word = ts.getAttribute(CharTermAttribute.class).toString(); Integer wordId = dictionary.get(word); // if the word is not in the dictionary, skip it if (wordId != null) { words.add(word); wordCount++; } } } // Mark : Modified ts.end(); ts.close(); // create vector wordId => weight using tfidf Vector vector = new RandomAccessSparseVector(10000); TFIDF tfidf = new TFIDF(); for (Multiset.Entry<String> entry : words.entrySet()) { String word = entry.getElement(); int count = entry.getCount(); Integer wordId = dictionary.get(word); Long freq = documentFrequency.get(wordId); double tfIdfValue = tfidf.calculate(count, freq.intValue(), wordCount, documentCount); vector.setQuick(wordId, tfIdfValue); } // With the classifier, we get one score for each label // The label with the highest score is the one the tweet is more likely to // be associated to Vector resultVector = classifier.classifyFull(vector); double bestScore = -Double.MAX_VALUE; int bestCategoryId = -1; for (Element element : resultVector.all()) { int categoryId = element.index(); double score = element.get(); if (score > bestScore) { bestScore = score; bestCategoryId = categoryId; } //System.out.print(" " + labels.get(categoryId) + ": " + score); } //System.out.println(" => " + labels.get(bestCategoryId)); //System.out.println("UPDATE " + tablename + " SET rep = '" + labels.get(bestCategoryId) + "' WHERE seq = " + id ); stmtU.executeUpdate("UPDATE " + tablename + " SET rep = '" + labels.get(bestCategoryId) + "' WHERE seq = " + id); } rs.close(); stmt.close(); stmtU.close(); c.commit(); c.close(); analyzer.close(); } catch (Exception e) { 
System.err.println(e.getClass().getName() + ": " + e.getMessage()); System.exit(0); } }
From source file:ca.ualberta.entitylinking.common.indexing.TFIDF3x.java
License:Open Source License
/** * Filter the string with StandardAnalyzer. * @param str/* ww w . j ava 2 s. c om*/ * @param removeStopWords Indicate if the stop words should be removed. * @return */ public static String processString(String str, boolean removeStopWords) { StringBuffer strBuf = new StringBuffer(); try { Analyzer analyzer = null; if (removeStopWords) analyzer = new StandardAnalyzer(Version.LUCENE_34); else analyzer = new TextAnalyzerWithStopwords(Version.LUCENE_34); TokenStream tokenStream = analyzer.tokenStream("string", new StringReader(str)); CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class); tokenStream.reset(); while (tokenStream.incrementToken()) { String term = charTermAttribute.toString(); strBuf.append(term + " "); } analyzer.close(); } catch (Exception e) { e.printStackTrace(); } return strBuf.toString().trim(); }
From source file:com.bizosys.hsearch.inpipe.TokenizeStandard.java
License:Apache License
public void visit(Object docObj, boolean multiWriter) throws ApplicationFault, SystemFault { if (null == docObj) throw new ApplicationFault("No document"); Doc doc = (Doc) docObj; doc.readers = super.getReaders(doc); if (null == doc) return;/* ww w . j av a2s. c om*/ try { Analyzer analyzer = new StandardAnalyzer(LuceneConstants.version); for (ReaderType reader : doc.readers) { if (null == doc.terms) doc.terms = new DocTerms(); TokenStream stream = analyzer.tokenStream(reader.type, reader.reader); TermStream ts = new TermStream(reader.docSection, stream, reader.type); doc.terms.addTokenStream(ts); //reader.reader stream is used during token computing. } analyzer.close(); } catch (Exception ex) { throw new SystemFault(ex); } }
From source file:com.bizosys.unstructured.Example.java
License:Apache License
public static void main(String[] args) throws Exception { SearchConfiguration conf = SearchConfiguration.getInstance(); Map<String, Integer> docTypes = new HashMap<String, Integer>(); docTypes.put("emp", 1); conf.instantiateDocumentTypeCodes(docTypes); Map<String, Integer> fldTypes = new HashMap<String, Integer>(); fldTypes.put("id", 1); fldTypes.put("name", 2); fldTypes.put("city", 3); fldTypes.put("description", 4); conf.instantiateFieldTypeCodes(fldTypes); Document doc = new Document(); doc.add(new Field("id", "DOC 001", Field.Store.NO, Field.Index.NOT_ANALYZED)); doc.add(new Field("name", "Abinash", Field.Store.NO, Field.Index.NOT_ANALYZED)); doc.add(new Field("city", "Abinash Bangalore", Field.Store.NO, Field.Index.ANALYZED)); doc.add(new Field("description", "Abinash works in Big Data. Abinash also plays badminton", Field.Store.NO, Field.Index.ANALYZED)); String INDEX_NAME = "Documents"; AnalyzerFactory.getInstance().setDefaultAnalyzer(getAnalyzer()); IndexWriter writer = new IndexWriter(new HSearchTableDocuments()); try {// ww w .j a v a 2s.c om writer.addDocument(1, doc, "emp", AnalyzerFactory.getInstance()); } finally { AnalyzerFactory.getInstance().close(); } HBaseTableSchema.getInstance().getSchema(); //writer.commit("merge1", INDEX_NAME); if (null != writer) writer.close(); Analyzer qAnalyzer = AnalyzerFactory.getInstance().getAnalyzer("emp"); Map<String, String> multiqueryParts = new HashMap<String, String>(); String multiQuery = new IndexSearcher().searchQueryPartsFill(INDEX_NAME, "*", "*:Abinash", qAnalyzer, multiqueryParts); if (null != qAnalyzer) qAnalyzer.close(); Client ht = new Client(); ht.execute("demo-table", multiQuery, multiqueryParts); }
From source file:com.bizosys.unstructured.StopwordAndSynonymAnalyzer.java
License:Apache License
public static void main(String[] args) throws IOException { Document doc = new Document(); doc.add(new Field("description", "dress/t-shirt dress for \"good boy\"", Field.Store.NO, Field.Index.ANALYZED)); Analyzer analyzer = new StopwordAndSynonymAnalyzer(); for (Fieldable field : doc.getFields()) { String query = "dress/t-shirt dress for \"good boy\""; StringReader sr = new StringReader(query); TokenStream stream = analyzer.tokenStream(field.name(), sr); CharTermAttribute termA = (CharTermAttribute) stream.getAttribute(CharTermAttribute.class); if (DEBUG_ENABLED) { while (stream.incrementToken()) { IdSearchLog.l.debug("Term:" + termA.toString()); }//from ww w . j ava2s .c om } sr.close(); } analyzer.close(); }
From source file:com.chimpler.example.bayes.Classifier.java
License:Apache License
public static void main(String[] args) throws Exception { if (args.length < 5) { System.out.println("Arguments: [model] [label index] [dictionnary] [document frequency] [tweet file]"); return;//from ww w . j a va2 s .c om } String modelPath = args[0]; String labelIndexPath = args[1]; String dictionaryPath = args[2]; String documentFrequencyPath = args[3]; String tweetsPath = args[4]; Configuration configuration = new Configuration(); // model is a matrix (wordId, labelId) => probability score NaiveBayesModel model = NaiveBayesModel.materialize(new Path(modelPath), configuration); StandardNaiveBayesClassifier classifier = new StandardNaiveBayesClassifier(model); // labels is a map label => classId Map<Integer, String> labels = BayesUtils.readLabelIndex(configuration, new Path(labelIndexPath)); Map<String, Integer> dictionary = readDictionnary(configuration, new Path(dictionaryPath)); Map<Integer, Long> documentFrequency = readDocumentFrequency(configuration, new Path(documentFrequencyPath)); // analyzer used to extract word from tweet Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_43); int labelCount = labels.size(); int documentCount = documentFrequency.get(-1).intValue(); System.out.println("Number of labels: " + labelCount); System.out.println("Number of documents in training set: " + documentCount); BufferedReader reader = new BufferedReader(new FileReader(tweetsPath)); while (true) { String line = reader.readLine(); if (line == null) { break; } String[] tokens = line.split("\t", 2); String tweetId = tokens[0]; String tweet = tokens[1]; System.out.println("Tweet: " + tweetId + "\t" + tweet); Multiset<String> words = ConcurrentHashMultiset.create(); // extract words from tweet TokenStream ts = analyzer.tokenStream("text", new StringReader(tweet)); CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class); ts.reset(); int wordCount = 0; while (ts.incrementToken()) { if (termAtt.length() > 0) { String word = 
ts.getAttribute(CharTermAttribute.class).toString(); Integer wordId = dictionary.get(word); // if the word is not in the dictionary, skip it if (wordId != null) { words.add(word); wordCount++; } } } // create vector wordId => weight using tfidf Vector vector = new RandomAccessSparseVector(10000); TFIDF tfidf = new TFIDF(); for (Multiset.Entry<String> entry : words.entrySet()) { String word = entry.getElement(); int count = entry.getCount(); Integer wordId = dictionary.get(word); Long freq = documentFrequency.get(wordId); double tfIdfValue = tfidf.calculate(count, freq.intValue(), wordCount, documentCount); vector.setQuick(wordId, tfIdfValue); } // With the classifier, we get one score for each label // The label with the highest score is the one the tweet is more likely to // be associated to Vector resultVector = classifier.classifyFull(vector); double bestScore = -Double.MAX_VALUE; int bestCategoryId = -1; for (Element element : resultVector.all()) { int categoryId = element.index(); double score = element.get(); if (score > bestScore) { bestScore = score; bestCategoryId = categoryId; } System.out.print(" " + labels.get(categoryId) + ": " + score); } System.out.println(" => " + labels.get(bestCategoryId)); } analyzer.close(); reader.close(); }
From source file:com.github.tteofili.looseen.MinHashClassifier.java
License:Apache License
private Query buildQuery(String field, String query, int min, int hashCount, int hashSetSize) throws IOException { Analyzer chain = createMinHashAnalyzer(min, hashCount, hashSetSize); ArrayList<String> tokens = getTokens(chain, field, query); chain.close(); BooleanQuery.Builder builder = new BooleanQuery.Builder(); for (String token : tokens) { builder.add(new ConstantScoreQuery(new TermQuery(new Term("text", token))), BooleanClause.Occur.SHOULD); }//from w w w . j av a 2s . co m return builder.build(); }
From source file:com.khepry.frackhem.entities.Blendeds.java
License:Apache License
public void indexViaLucene(String textFilePath, String textColSeparator, String casEdfIdFieldName, Map<String, Toxicity> toxicities) throws IOException { String message;// w w w.j av a 2 s . c o m message = "Start Indexing Blendeds via Lucene..."; if (outputToSystemOut) { System.out.println(message); } if (outputToMsgQueue) { progressMessageQueue.send(new MessageInput(message)); } File textFile = new File(textFilePath); if (textFile.exists()) { File indexFolder = new File(indexFolderPath); if (!indexFolder.exists()) { indexFolder.mkdir(); } else { deleteFolder(indexFolder); if (!indexFolder.exists()) { indexFolder.mkdir(); } } File taxonomyFolder = new File(taxonomyFolderPath); if (!taxonomyFolder.exists()) { taxonomyFolder.mkdir(); } else { deleteFolder(taxonomyFolder); if (!taxonomyFolder.exists()) { taxonomyFolder.mkdir(); } } if (indexFolder.exists() && taxonomyFolder.exists()) { List<String> colHeaders = new ArrayList<>(); Map<String, Integer> colIndexes = new LinkedHashMap<>(); Map<String, String> mapIndexFields = new LinkedHashMap<>(); Map<String, String> mapStatsFields = new LinkedHashMap<>(); String[] pieces; String[] tuples; pieces = indexFields.split(","); for (String indexField : pieces) { mapIndexFields.put(indexField, indexField); } pieces = statsFields.split(","); for (String statField : pieces) { tuples = statField.split(":"); mapStatsFields.put(tuples[0], tuples.length > 1 ? 
tuples[1] : tuples[0]); } SimpleFSDirectory indexDirectory = new SimpleFSDirectory(indexFolder); Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_44); IndexWriterConfig indexWriterConfig = new IndexWriterConfig(Version.LUCENE_44, analyzer); IndexWriter indexWriter = new IndexWriter(indexDirectory, indexWriterConfig); SimpleFSDirectory taxonomyDirectory = new SimpleFSDirectory(taxonomyFolder); TaxonomyWriter taxonomyWriter = new DirectoryTaxonomyWriter(taxonomyDirectory, OpenMode.CREATE); FacetFields facetFields = new FacetFields(taxonomyWriter); List<CategoryPath> taxonomyCategories = new ArrayList<>(); String line; Integer rcdCount = 0; StringBuilder sb = new StringBuilder(); BufferedReader br = new BufferedReader(new FileReader(textFile)); while ((line = br.readLine()) != null) { rcdCount++; pieces = line.split(textColSeparator); if (rcdCount == 1) { int i = 0; for (String colHeader : pieces) { colHeaders.add(colHeader.trim()); colIndexes.put(colHeader, i); } } else { if (pieces.length == colHeaders.size()) { sb.setLength(0); Document document = new Document(); for (int i = 0; i < pieces.length; i++) { Field field = new TextField(colHeaders.get(i), pieces[i].trim(), Store.YES); document.add(field); if (mapIndexFields.containsKey(colHeaders.get(i))) { if (!pieces[i].trim().equals("")) { sb.append(pieces[i].trim()); sb.append(" "); } } } // append toxicity information to the document String toxCasEdfId = document.get(casEdfIdFieldName).trim(); Toxicity toxicity = new Toxicity(); if (toxicities.containsKey(toxCasEdfId)) { toxicity = toxicities.get(toxCasEdfId); document.add(new TextField("toxChemicalName", toxicity.getToxChemicalName().trim(), Store.YES)); sb.append(toxicity.getToxChemicalName().trim()); sb.append(" "); document.add(new TextField("toxRecognized", toxicity.getToxRecognized().trim(), Store.YES)); sb.append(toxicity.getToxRecognized().trim()); sb.append(" "); document.add(new TextField("toxSuspected", toxicity.getToxSuspected().trim(), 
Store.YES)); sb.append(toxicity.getToxSuspected().trim()); sb.append(" "); } else { document.add(new TextField("toxChemicalName", "", Store.YES)); document.add(new TextField("toxRecognized", "", Store.YES)); document.add(new TextField("toxSuspected", "", Store.YES)); } Field field = new TextField("text", sb.toString().trim(), Store.NO); document.add(field); String toxChemical = toxicity.getToxChemicalName().trim(); // categorize recognized toxicities String toxRecognized = toxicity.getToxRecognized().trim(); if (!toxRecognized.equals("")) { taxonomyCategories.add(new CategoryPath("toxRecognized", "CasEdfId", toxCasEdfId)); taxonomyCategories.add(new CategoryPath("toxRecognized", "Chemical", toxChemical.replace("/", "|"))); for (String value : toxRecognized.replace(" ", ",").split(",")) { if (!value.trim().equals("")) { taxonomyCategories .add(new CategoryPath("toxRecognized", "Toxicity", value)); } } } // categorize suspected toxicities String toxSuspected = toxicity.getToxSuspected().trim(); if (!toxSuspected.equals("")) { taxonomyCategories.add(new CategoryPath("toxSuspected", "CasEdfId", toxCasEdfId)); taxonomyCategories.add(new CategoryPath("toxSuspected", "Chemical", toxChemical.replace("/", "|"))); for (String value : toxSuspected.replace(" ", ",").split(",")) { if (!value.trim().equals("")) { taxonomyCategories.add(new CategoryPath("toxSuspected", "Toxicity", value)); } } } // build up "stats" taxonomy categories for (String statsKey : mapStatsFields.keySet()) { if (mapIndexFields.containsKey(statsKey)) { String fieldValue = mapIndexFields.get(statsKey); if (!statsKey.trim().equals("") && !fieldValue.trim().equals("")) { taxonomyCategories.add(new CategoryPath("Blendeds", statsKey, fieldValue)); } } } if (taxonomyCategories.size() > 0) { facetFields.addFields(document, taxonomyCategories); // System.out.println("Taxonomies added: " + // taxonomyCategories.size()); } indexWriter.addDocument(document); if (progressInterval > 0 && rcdCount % progressInterval == 
0) { message = "Records indexed: " + rcdCount; if (outputToSystemOut) { System.out.println(message); } if (outputToMsgQueue) { progressMessageQueue.send(new MessageInput(message)); } } taxonomyCategories.clear(); } } } br.close(); message = "Records indexed: " + rcdCount; if (outputToSystemOut) { System.out.println(message); } if (outputToMsgQueue) { progressMessageQueue.send(new MessageInput(message)); } sb.setLength(0); sb.trimToSize(); indexWriter.commit(); indexWriter.forceMerge(1); indexWriter.close(); taxonomyWriter.commit(); taxonomyWriter.close(); analyzer.close(); indexDirectory.close(); taxonomyDirectory.close(); } else { message = "Lucene Index Folder: " + indexFolder + " or Lucene Taxonomy folder: " + taxonomyFolder + " does not exist!"; if (outputToSystemErr) { System.err.println(message); } if (outputToMsgQueue) { progressMessageQueue.send(new MessageInput(message)); } } message = "Ended Indexing Blendeds via Lucene!"; if (outputToSystemOut) { System.out.println(message); } if (outputToMsgQueue) { progressMessageQueue.send(new MessageInput(message)); } } }
From source file:com.khepry.frackhem.entities.Chemicals.java
License:Apache License
public void indexViaLucene(String textFilePath, String textColSeparator, String casEdfIdFieldName, Map<String, Toxicity> toxicities) throws IOException { String message;//from w w w . j av a 2 s .c om message = "Start Indexing Chemicals via Lucene..."; if (outputToSystemOut) { System.out.println(message); } if (outputToMsgQueue) { progressMessageQueue.send(new MessageInput(message)); } File textFile = new File(textFilePath); if (textFile.exists()) { File indexFolder = new File(indexFolderPath); if (!indexFolder.exists()) { indexFolder.mkdir(); } else { deleteFolder(indexFolder); if (!indexFolder.exists()) { indexFolder.mkdir(); } } File taxonomyFolder = new File(taxonomyFolderPath); if (!taxonomyFolder.exists()) { taxonomyFolder.mkdir(); } else { deleteFolder(taxonomyFolder); if (!taxonomyFolder.exists()) { taxonomyFolder.mkdir(); } } if (indexFolder.exists() && taxonomyFolder.exists()) { List<String> colHeaders = new ArrayList<>(); Map<String, Integer> colIndexes = new LinkedHashMap<>(); Map<String, String> mapIndexFields = new LinkedHashMap<>(); Map<String, String> mapStatsFields = new LinkedHashMap<>(); String[] pieces; String[] tuples; pieces = indexFields.split(","); for (String indexField : pieces) { mapIndexFields.put(indexField, indexField); } pieces = statsFields.split(","); for (String statField : pieces) { tuples = statField.split(":"); mapStatsFields.put(tuples[0], tuples.length > 1 ? 
tuples[1] : tuples[0]); } SimpleFSDirectory indexDirectory = new SimpleFSDirectory(indexFolder); Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_44); IndexWriterConfig indexWriterConfig = new IndexWriterConfig(Version.LUCENE_44, analyzer); IndexWriter indexWriter = new IndexWriter(indexDirectory, indexWriterConfig); SimpleFSDirectory taxonomyDirectory = new SimpleFSDirectory(taxonomyFolder); TaxonomyWriter taxonomyWriter = new DirectoryTaxonomyWriter(taxonomyDirectory, OpenMode.CREATE); FacetFields facetFields = new FacetFields(taxonomyWriter); List<CategoryPath> taxonomyCategories = new ArrayList<>(); String line; Integer rcdCount = 0; StringBuilder sb = new StringBuilder(); BufferedReader br = new BufferedReader(new FileReader(textFile)); while ((line = br.readLine()) != null) { rcdCount++; pieces = line.split(textColSeparator); if (rcdCount == 1) { int i = 0; for (String colHeader : pieces) { colHeaders.add(colHeader.trim()); colIndexes.put(colHeader, i); } } else { if (pieces.length == colHeaders.size()) { sb.setLength(0); Document document = new Document(); for (int i = 0; i < pieces.length; i++) { Field field = new TextField(colHeaders.get(i), pieces[i].trim(), Store.YES); document.add(field); if (mapIndexFields.containsKey(colHeaders.get(i))) { if (!pieces[i].trim().equals("")) { sb.append(pieces[i].trim()); sb.append(" "); } } } // append toxicity information to the document String toxCasEdfId = document.get(casEdfIdFieldName).trim(); Toxicity toxicity = new Toxicity(); if (toxicities.containsKey(toxCasEdfId)) { toxicity = toxicities.get(toxCasEdfId); document.add(new TextField("toxChemicalName", toxicity.getToxChemicalName().trim(), Store.YES)); sb.append(toxicity.getToxChemicalName().trim()); sb.append(" "); document.add(new TextField("toxRecognized", toxicity.getToxRecognized().trim(), Store.YES)); sb.append(toxicity.getToxRecognized().trim()); sb.append(" "); document.add(new TextField("toxSuspected", toxicity.getToxSuspected().trim(), 
Store.YES)); sb.append(toxicity.getToxSuspected().trim()); sb.append(" "); } else { document.add(new TextField("toxChemicalName", "", Store.YES)); document.add(new TextField("toxRecognized", "", Store.YES)); document.add(new TextField("toxSuspected", "", Store.YES)); } Field field = new TextField("text", sb.toString().trim(), Store.NO); document.add(field); String toxChemical = toxicity.getToxChemicalName().trim(); // categorize recognized toxicities String toxRecognized = toxicity.getToxRecognized().trim(); if (!toxRecognized.equals("")) { taxonomyCategories.add(new CategoryPath("toxRecognized", "CasEdfId", toxCasEdfId)); taxonomyCategories.add(new CategoryPath("toxRecognized", "Chemical", toxChemical.replace("/", "|"))); for (String value : toxRecognized.replace(" ", ",").split(",")) { if (!value.trim().equals("")) { taxonomyCategories .add(new CategoryPath("toxRecognized", "Toxicity", value)); } } } // categorize suspected toxicities String toxSuspected = toxicity.getToxSuspected().trim(); if (!toxSuspected.equals("")) { taxonomyCategories.add(new CategoryPath("toxSuspected", "CasEdfId", toxCasEdfId)); taxonomyCategories.add(new CategoryPath("toxSuspected", "Chemical", toxChemical.replace("/", "|"))); for (String value : toxSuspected.replace(" ", ",").split(",")) { if (!value.trim().equals("")) { taxonomyCategories.add(new CategoryPath("toxSuspected", "Toxicity", value)); } } } // build up "stats" taxonomy categories for (String statsKey : mapStatsFields.keySet()) { if (mapIndexFields.containsKey(statsKey)) { String fieldValue = mapIndexFields.get(statsKey); if (!statsKey.trim().equals("") && !fieldValue.trim().equals("")) { taxonomyCategories.add(new CategoryPath("Chemicals", statsKey, fieldValue)); } } } if (taxonomyCategories.size() > 0) { facetFields.addFields(document, taxonomyCategories); // System.out.println("Taxonomies added: " + // taxonomyCategories.size()); } indexWriter.addDocument(document); if (progressInterval > 0 && rcdCount % progressInterval 
== 0) { message = "Records indexed: " + rcdCount; if (outputToSystemOut) { System.out.println(message); } if (outputToMsgQueue) { progressMessageQueue.send(new MessageInput(message)); } } taxonomyCategories.clear(); } } } br.close(); message = "Records indexed: " + rcdCount; if (outputToSystemOut) { System.out.println(message); } if (outputToMsgQueue) { progressMessageQueue.send(new MessageInput(message)); } sb.setLength(0); sb.trimToSize(); indexWriter.commit(); indexWriter.forceMerge(1); indexWriter.close(); taxonomyWriter.commit(); taxonomyWriter.close(); analyzer.close(); indexDirectory.close(); taxonomyDirectory.close(); } else { message = "Lucene Index Folder: " + indexFolder + " or Lucene Taxonomy folder: " + taxonomyFolder + " does not exist!"; if (outputToSystemErr) { System.err.println(message); } if (outputToMsgQueue) { progressMessageQueue.send(new MessageInput(message)); } } message = "Ended Indexing Chemicals via Lucene!"; if (outputToSystemOut) { System.out.println(message); } if (outputToMsgQueue) { progressMessageQueue.send(new MessageInput(message)); } } }