List of usage examples for org.apache.lucene.facet.taxonomy.directory DirectoryTaxonomyWriter DirectoryTaxonomyWriter
public DirectoryTaxonomyWriter(Directory directory, OpenMode openMode) throws IOException
From source file:com.chimpler.example.FacetLuceneIndexer.java
License:Apache License
public static void main(String args[]) throws Exception { // if (args.length != 3) { // System.err.println("Parameters: [index directory] [taxonomy directory] [json file]"); // System.exit(1); // }//from w w w .ja va 2 s . com String indexDirectory = "index"; String taxonomyDirectory = "taxonomy"; String jsonFileName = "/home/qiuqiang/workspace/facet-lucene-example/books.json"; IndexWriterConfig writerConfig = new IndexWriterConfig(LUCENE_VERSION, new WhitespaceAnalyzer(LUCENE_VERSION)); writerConfig.setOpenMode(OpenMode.APPEND); IndexWriter indexWriter = new IndexWriter(FSDirectory.open(new File(indexDirectory)), writerConfig); TaxonomyWriter taxonomyWriter = new DirectoryTaxonomyWriter(MMapDirectory.open(new File(taxonomyDirectory)), OpenMode.APPEND); TaxonomyReader taxonomyReader = new DirectoryTaxonomyReader(FSDirectory.open(new File(taxonomyDirectory))); String content = IOUtils.toString(new FileInputStream(jsonFileName)); JSONArray bookArray = new JSONArray(content); Field idField = new IntField("id", 0, Store.YES); Field titleField = new TextField("title", "", Store.YES); Field authorsField = new TextField("authors", "", Store.YES); Field bookCategoryField = new TextField("book_category", "", Store.YES); indexWriter.deleteAll(); FacetFields facetFields = new FacetFields(taxonomyWriter); for (int i = 0; i < bookArray.length(); i++) { Document document = new Document(); JSONObject book = bookArray.getJSONObject(i); int id = book.getInt("id"); String title = book.getString("title"); String bookCategory = book.getString("book_category"); List<CategoryPath> categoryPaths = new ArrayList<CategoryPath>(); String authorsString = ""; JSONArray authors = book.getJSONArray("authors"); for (int j = 0; j < authors.length(); j++) { String author = authors.getString(j); if (j > 0) { authorsString += ", "; } categoryPaths.add(new CategoryPath("author", author)); authorsString += author; } categoryPaths.add(new CategoryPath("book_category" + bookCategory, '/')); 
idField.setIntValue(id); titleField.setStringValue(title); authorsField.setStringValue(authorsString); bookCategoryField.setStringValue(bookCategory); facetFields.addFields(document, categoryPaths); document.add(idField); document.add(titleField); document.add(authorsField); document.add(bookCategoryField); indexWriter.addDocument(document); System.out.printf("Book: id=%d, title=%s, book_category=%s, authors=%s\n", id, title, bookCategory, authors); } taxonomyWriter.prepareCommit(); try { taxonomyWriter.commit(); } catch (Exception e) { taxonomyWriter.rollback(); } // taxonomyWriter.close(); // // indexWriter.commit(); // indexWriter.close(); String query = "story"; IndexReader indexReader = DirectoryReader.open(indexWriter, false); IndexReader indexReader2 = DirectoryReader.open(indexWriter, false); System.out.println(indexReader == indexReader2); IndexSearcher indexSearcher = new IndexSearcher(indexReader); TaxonomyReader newTaxonomyReader = DirectoryTaxonomyReader.openIfChanged(taxonomyReader); if (newTaxonomyReader != null) { TaxonomyReader tmp = taxonomyReader; taxonomyReader = newTaxonomyReader; tmp.close(); } else { System.out.println("null"); } ArrayList<FacetRequest> facetRequests = new ArrayList<FacetRequest>(); facetRequests.add(new CountFacetRequest(new CategoryPath("author"), 100)); facetRequests.add(new CountFacetRequest(new CategoryPath("book_category"), 100)); FacetSearchParams searchParams = new FacetSearchParams(facetRequests); ComplexPhraseQueryParser queryParser = new ComplexPhraseQueryParser(LUCENE_VERSION, "title", new StandardAnalyzer(LUCENE_VERSION)); Query luceneQuery = queryParser.parse(query); // Collectors to get top results and facets TopScoreDocCollector topScoreDocCollector = TopScoreDocCollector.create(10, true); FacetsCollector facetsCollector = FacetsCollector.create(searchParams, indexReader, taxonomyReader); indexSearcher.search(luceneQuery, MultiCollector.wrap(topScoreDocCollector, facetsCollector)); 
System.out.println("Found:"); for (ScoreDoc scoreDoc : topScoreDocCollector.topDocs().scoreDocs) { Document document = indexReader.document(scoreDoc.doc); System.out.printf("- book: id=%s, title=%s, book_category=%s, authors=%s, score=%f\n", document.get("id"), document.get("title"), document.get("book_category"), document.get("authors"), scoreDoc.score); } System.out.println("Facets:"); for (FacetResult facetResult : facetsCollector.getFacetResults()) { System.out.println("- " + facetResult.getFacetResultNode().label); for (FacetResultNode facetResultNode : facetResult.getFacetResultNode().subResults) { System.out.printf(" - %s (%f)\n", facetResultNode.label.toString(), facetResultNode.value); for (FacetResultNode subFacetResultNode : facetResultNode.subResults) { System.out.printf(" - %s (%f)\n", subFacetResultNode.label.toString(), subFacetResultNode.value); } } } taxonomyReader.close(); indexReader.close(); taxonomyWriter.commit(); taxonomyWriter.close(); indexWriter.commit(); indexWriter.close(); }
From source file:com.fuerve.villageelder.actions.results.SearchResultItemTest.java
License:Apache License
/**
 * Populates the given directories with a two-document dummy index and the matching facet
 * taxonomy. Each document carries an "Author" facet alongside its stored fields.
 *
 * @param indexDirectory directory that receives the document index (opened in CREATE mode)
 * @param taxonomyDirectory directory that receives the facet taxonomy (opened in CREATE mode)
 * @throws IOException if writing either index fails
 */
private void buildDummyIndex(final Directory indexDirectory, final Directory taxonomyDirectory)
        throws IOException {
    IndexWriterConfig iwc = new IndexWriterConfig(Lucene.LUCENE_VERSION, Lucene.getPerFieldAnalyzer());
    iwc.setOpenMode(OpenMode.CREATE);
    IndexWriter iw = new IndexWriter(indexDirectory, iwc);
    TaxonomyWriter tw = new DirectoryTaxonomyWriter(taxonomyDirectory, OpenMode.CREATE);

    List<CategoryPath> categories = new ArrayList<CategoryPath>();
    FacetFields facetFields = new FacetFields(tw);

    // First document: author "foo", revision 50.
    Document doc = new Document();
    categories.clear();
    doc.add(new StringField("Author", "foo", Store.YES));
    categories.add(new CategoryPath("Author", "foo"));
    doc.add(new LongField("RevisionNumber", 50L, Store.YES));
    doc.add(new StringField("Revision", "50", Store.YES));
    doc.add(new TextField("Message", "stuff", Store.YES));
    // The facet fields have to be added to the document BEFORE it is indexed; adding them
    // afterwards only updates the taxonomy and leaves the indexed document without facets.
    facetFields.addFields(doc, categories);
    iw.addDocument(doc);

    // Second document: author "bar", revision 5000.
    doc = new Document();
    categories.clear();
    doc.add(new StringField("Author", "bar", Store.YES));
    categories.add(new CategoryPath("Author", "bar"));
    doc.add(new LongField("RevisionNumber", 5000L, Store.YES));
    doc.add(new StringField("Revision", "5000", Store.YES));
    doc.add(new TextField("Message", "stuff", Store.YES));
    facetFields.addFields(doc, categories);
    iw.addDocument(doc);

    tw.commit();
    tw.close();
    iw.commit();
    iw.close();
}
From source file:com.fuerve.villageelder.indexing.IndexManager.java
License:Apache License
/** * Gets the writers for the regular and taxonomy indices ready to go. * @throws IOException A fatal exception occurred while trying to * construct the index writers./*from w ww . ja va 2 s. co m*/ */ private void initializeWriters() throws IOException { if (luceneVersion == null || analyzer == null) { throw new IllegalArgumentException("The Lucene version and the index analyzer were unspecified " + "when attempting to create the index writers"); } IndexWriterConfig iwc = new IndexWriterConfig(luceneVersion, analyzer); iwc.setOpenMode(openMode); indexWriter = new IndexWriter(indexDirectory, iwc); taxonomyWriter = new DirectoryTaxonomyWriter(taxonomyDirectory, openMode); }
From source file:com.fuerve.villageelder.search.SearcherTest.java
License:Apache License
/** * Test method for {@link com.fuerve.villageelder.search.Searcher#initializeSearch()}. *//*from w w w .j a v a 2 s.c om*/ @SuppressWarnings("unused") @Test public final void testInitializeSearch() throws Exception { // Gather declared fields. Field indexDirectoryField = Searcher.class.getDeclaredField("indexDirectory"); Field taxonomyDirectoryField = Searcher.class.getDeclaredField("taxonomyDirectory"); Field indexDirectoryNameField = Searcher.class.getDeclaredField("indexDirectoryName"); Field taxonomyDirectoryNameField = Searcher.class.getDeclaredField("taxonomyDirectoryName"); Field stringDirectoriesField = Searcher.class.getDeclaredField("stringDirectories"); Field initializedField = Searcher.class.getDeclaredField("initialized"); Field searchField = Searcher.class.getDeclaredField("search"); Field indexReaderField = Searcher.class.getDeclaredField("indexReader"); Field indexSearcherField = Searcher.class.getDeclaredField("indexSearcher"); Field taxonomyReaderField = Searcher.class.getDeclaredField("taxonomyReader"); indexDirectoryField.setAccessible(true); taxonomyDirectoryField.setAccessible(true); indexDirectoryNameField.setAccessible(true); taxonomyDirectoryNameField.setAccessible(true); stringDirectoriesField.setAccessible(true); initializedField.setAccessible(true); searchField.setAccessible(true); indexReaderField.setAccessible(true); indexSearcherField.setAccessible(true); taxonomyReaderField.setAccessible(true); // Setup Directory indexDirectoryExpected = new RAMDirectory(); Directory taxonomyDirectoryExpected = new RAMDirectory(); IndexWriterConfig iwc = new IndexWriterConfig(Lucene.LUCENE_VERSION, Lucene.getPerFieldAnalyzer()); IndexWriter iw = new IndexWriter(indexDirectoryExpected, iwc); TaxonomyWriter tw = new DirectoryTaxonomyWriter(taxonomyDirectoryExpected, OpenMode.CREATE); iw.commit(); tw.commit(); Searcher target = new Searcher(indexDirectoryExpected, taxonomyDirectoryExpected); target.initializeSearch(); // Gather field values. 
Directory indexDirectoryActual = (Directory) indexDirectoryField.get(target); Directory taxonomyDirectoryActual = (Directory) taxonomyDirectoryField.get(target); String indexDirectoryNameActual = (String) indexDirectoryNameField.get(target); String taxonomyDirectoryNameActual = (String) taxonomyDirectoryNameField.get(target); boolean stringDirectoriesActual = stringDirectoriesField.getBoolean(target); boolean initializedActual = initializedField.getBoolean(target); Search searchFieldActual = (Search) searchField.get(target); IndexReader indexReaderActual = (IndexReader) indexReaderField.get(target); IndexSearcher indexSearcherActual = (IndexSearcher) indexSearcherField.get(target); TaxonomyReader taxonomyReaderActual = (TaxonomyReader) taxonomyReaderField.get(target); // Test assertEquals(true, initializedActual); assertNotNull(indexReaderActual); assertNotNull(indexSearcherActual); assertNotNull(taxonomyReaderActual); // Finish tw.close(); iw.close(); }
From source file:com.fuerve.villageelder.search.SearchTest.java
License:Apache License
/** * Test method for {@link com.fuerve.villageelder.search.Search#getFacetsCollector(org.apache.lucene.index.DirectoryReader, org.apache.lucene.facet.taxonomy.TaxonomyReader)}. *//* ww w. j ava 2s .c o m*/ @Test @SuppressWarnings({ "unchecked", "unused" }) public final void testGetFacetsCollector() throws Exception { // Constants Field defaultSortField = Search.class.getDeclaredField("DEFAULT_SORT"); Field defaultFacetsField = Search.class.getDeclaredField("DEFAULT_FACETS"); Field defaultFacetStringsField = Search.class.getDeclaredField("DEFAULT_FACET_STRINGS"); Field defaultAnalyzerField = Search.class.getDeclaredField("DEFAULT_ANALYZER"); Field defaultHitsField = Search.class.getDeclaredField("DEFAULT_HITS"); defaultSortField.setAccessible(true); defaultFacetsField.setAccessible(true); defaultFacetStringsField.setAccessible(true); defaultAnalyzerField.setAccessible(true); defaultHitsField.setAccessible(true); final Sort defaultSort = (Sort) defaultSortField.get(null); final List<FacetRequest> defaultFacets = (List<FacetRequest>) defaultFacetsField.get(null); final Map<String, Integer> defaultFacetStrings = (Map<String, Integer>) defaultFacetStringsField.get(null); final Analyzer defaultAnalyzer = (Analyzer) defaultAnalyzerField.get(null); final int defaultHits = defaultHitsField.getInt(null); // Private members Field queryField = Search.class.getDeclaredField("query"); Field sortField = Search.class.getDeclaredField("sort"); Field facetsField = Search.class.getDeclaredField("facets"); queryField.setAccessible(true); sortField.setAccessible(true); facetsField.setAccessible(true); // Test setup QueryParser parser = getQueryParser(); Query queryExpected = parser.parse("test:foo"); List<FacetRequest> facetsExpected = new ArrayList<FacetRequest>(); Sort sortExpected = Sort.RELEVANCE; Search target = new Search(queryExpected, facetsExpected, sortExpected); target.addFacet("test", 100); // Gather fields Query queryActual = (Query) queryField.get(target); Sort 
sortActual = (Sort) sortField.get(target); List<FacetRequest> facetsActual = (List<FacetRequest>) facetsField.get(target); // Set up some dummy indices. Directory indexDirectory = new RAMDirectory(); IndexWriterConfig iwc = new IndexWriterConfig(Lucene.LUCENE_VERSION, Lucene.getPerFieldAnalyzer()); IndexWriter iw = new IndexWriter(indexDirectory, iwc); Directory taxonomyDirectory = new RAMDirectory(); TaxonomyWriter tw = new DirectoryTaxonomyWriter(taxonomyDirectory, OpenMode.CREATE); iw.commit(); tw.commit(); // Test FacetsCollector actual = target.getFacetsCollector(DirectoryReader.open(indexDirectory), new DirectoryTaxonomyReader(taxonomyDirectory)); assertEquals("DocsOnlyCollector", actual.getClass().getSimpleName()); iw.close(); tw.close(); taxonomyDirectory.close(); }
From source file:com.khepry.frackhem.entities.Blendeds.java
License:Apache License
public void indexViaLucene(String textFilePath, String textColSeparator, String casEdfIdFieldName, Map<String, Toxicity> toxicities) throws IOException { String message;//from w w w.j a va 2s.co m message = "Start Indexing Blendeds via Lucene..."; if (outputToSystemOut) { System.out.println(message); } if (outputToMsgQueue) { progressMessageQueue.send(new MessageInput(message)); } File textFile = new File(textFilePath); if (textFile.exists()) { File indexFolder = new File(indexFolderPath); if (!indexFolder.exists()) { indexFolder.mkdir(); } else { deleteFolder(indexFolder); if (!indexFolder.exists()) { indexFolder.mkdir(); } } File taxonomyFolder = new File(taxonomyFolderPath); if (!taxonomyFolder.exists()) { taxonomyFolder.mkdir(); } else { deleteFolder(taxonomyFolder); if (!taxonomyFolder.exists()) { taxonomyFolder.mkdir(); } } if (indexFolder.exists() && taxonomyFolder.exists()) { List<String> colHeaders = new ArrayList<>(); Map<String, Integer> colIndexes = new LinkedHashMap<>(); Map<String, String> mapIndexFields = new LinkedHashMap<>(); Map<String, String> mapStatsFields = new LinkedHashMap<>(); String[] pieces; String[] tuples; pieces = indexFields.split(","); for (String indexField : pieces) { mapIndexFields.put(indexField, indexField); } pieces = statsFields.split(","); for (String statField : pieces) { tuples = statField.split(":"); mapStatsFields.put(tuples[0], tuples.length > 1 ? 
tuples[1] : tuples[0]); } SimpleFSDirectory indexDirectory = new SimpleFSDirectory(indexFolder); Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_44); IndexWriterConfig indexWriterConfig = new IndexWriterConfig(Version.LUCENE_44, analyzer); IndexWriter indexWriter = new IndexWriter(indexDirectory, indexWriterConfig); SimpleFSDirectory taxonomyDirectory = new SimpleFSDirectory(taxonomyFolder); TaxonomyWriter taxonomyWriter = new DirectoryTaxonomyWriter(taxonomyDirectory, OpenMode.CREATE); FacetFields facetFields = new FacetFields(taxonomyWriter); List<CategoryPath> taxonomyCategories = new ArrayList<>(); String line; Integer rcdCount = 0; StringBuilder sb = new StringBuilder(); BufferedReader br = new BufferedReader(new FileReader(textFile)); while ((line = br.readLine()) != null) { rcdCount++; pieces = line.split(textColSeparator); if (rcdCount == 1) { int i = 0; for (String colHeader : pieces) { colHeaders.add(colHeader.trim()); colIndexes.put(colHeader, i); } } else { if (pieces.length == colHeaders.size()) { sb.setLength(0); Document document = new Document(); for (int i = 0; i < pieces.length; i++) { Field field = new TextField(colHeaders.get(i), pieces[i].trim(), Store.YES); document.add(field); if (mapIndexFields.containsKey(colHeaders.get(i))) { if (!pieces[i].trim().equals("")) { sb.append(pieces[i].trim()); sb.append(" "); } } } // append toxicity information to the document String toxCasEdfId = document.get(casEdfIdFieldName).trim(); Toxicity toxicity = new Toxicity(); if (toxicities.containsKey(toxCasEdfId)) { toxicity = toxicities.get(toxCasEdfId); document.add(new TextField("toxChemicalName", toxicity.getToxChemicalName().trim(), Store.YES)); sb.append(toxicity.getToxChemicalName().trim()); sb.append(" "); document.add(new TextField("toxRecognized", toxicity.getToxRecognized().trim(), Store.YES)); sb.append(toxicity.getToxRecognized().trim()); sb.append(" "); document.add(new TextField("toxSuspected", toxicity.getToxSuspected().trim(), 
Store.YES)); sb.append(toxicity.getToxSuspected().trim()); sb.append(" "); } else { document.add(new TextField("toxChemicalName", "", Store.YES)); document.add(new TextField("toxRecognized", "", Store.YES)); document.add(new TextField("toxSuspected", "", Store.YES)); } Field field = new TextField("text", sb.toString().trim(), Store.NO); document.add(field); String toxChemical = toxicity.getToxChemicalName().trim(); // categorize recognized toxicities String toxRecognized = toxicity.getToxRecognized().trim(); if (!toxRecognized.equals("")) { taxonomyCategories.add(new CategoryPath("toxRecognized", "CasEdfId", toxCasEdfId)); taxonomyCategories.add(new CategoryPath("toxRecognized", "Chemical", toxChemical.replace("/", "|"))); for (String value : toxRecognized.replace(" ", ",").split(",")) { if (!value.trim().equals("")) { taxonomyCategories .add(new CategoryPath("toxRecognized", "Toxicity", value)); } } } // categorize suspected toxicities String toxSuspected = toxicity.getToxSuspected().trim(); if (!toxSuspected.equals("")) { taxonomyCategories.add(new CategoryPath("toxSuspected", "CasEdfId", toxCasEdfId)); taxonomyCategories.add(new CategoryPath("toxSuspected", "Chemical", toxChemical.replace("/", "|"))); for (String value : toxSuspected.replace(" ", ",").split(",")) { if (!value.trim().equals("")) { taxonomyCategories.add(new CategoryPath("toxSuspected", "Toxicity", value)); } } } // build up "stats" taxonomy categories for (String statsKey : mapStatsFields.keySet()) { if (mapIndexFields.containsKey(statsKey)) { String fieldValue = mapIndexFields.get(statsKey); if (!statsKey.trim().equals("") && !fieldValue.trim().equals("")) { taxonomyCategories.add(new CategoryPath("Blendeds", statsKey, fieldValue)); } } } if (taxonomyCategories.size() > 0) { facetFields.addFields(document, taxonomyCategories); // System.out.println("Taxonomies added: " + // taxonomyCategories.size()); } indexWriter.addDocument(document); if (progressInterval > 0 && rcdCount % progressInterval == 
0) { message = "Records indexed: " + rcdCount; if (outputToSystemOut) { System.out.println(message); } if (outputToMsgQueue) { progressMessageQueue.send(new MessageInput(message)); } } taxonomyCategories.clear(); } } } br.close(); message = "Records indexed: " + rcdCount; if (outputToSystemOut) { System.out.println(message); } if (outputToMsgQueue) { progressMessageQueue.send(new MessageInput(message)); } sb.setLength(0); sb.trimToSize(); indexWriter.commit(); indexWriter.forceMerge(1); indexWriter.close(); taxonomyWriter.commit(); taxonomyWriter.close(); analyzer.close(); indexDirectory.close(); taxonomyDirectory.close(); } else { message = "Lucene Index Folder: " + indexFolder + " or Lucene Taxonomy folder: " + taxonomyFolder + " does not exist!"; if (outputToSystemErr) { System.err.println(message); } if (outputToMsgQueue) { progressMessageQueue.send(new MessageInput(message)); } } message = "Ended Indexing Blendeds via Lucene!"; if (outputToSystemOut) { System.out.println(message); } if (outputToMsgQueue) { progressMessageQueue.send(new MessageInput(message)); } } }
From source file:com.khepry.frackhem.entities.Chemicals.java
License:Apache License
public void indexViaLucene(String textFilePath, String textColSeparator, String casEdfIdFieldName, Map<String, Toxicity> toxicities) throws IOException { String message;/* w ww .j a v a 2s .c o m*/ message = "Start Indexing Chemicals via Lucene..."; if (outputToSystemOut) { System.out.println(message); } if (outputToMsgQueue) { progressMessageQueue.send(new MessageInput(message)); } File textFile = new File(textFilePath); if (textFile.exists()) { File indexFolder = new File(indexFolderPath); if (!indexFolder.exists()) { indexFolder.mkdir(); } else { deleteFolder(indexFolder); if (!indexFolder.exists()) { indexFolder.mkdir(); } } File taxonomyFolder = new File(taxonomyFolderPath); if (!taxonomyFolder.exists()) { taxonomyFolder.mkdir(); } else { deleteFolder(taxonomyFolder); if (!taxonomyFolder.exists()) { taxonomyFolder.mkdir(); } } if (indexFolder.exists() && taxonomyFolder.exists()) { List<String> colHeaders = new ArrayList<>(); Map<String, Integer> colIndexes = new LinkedHashMap<>(); Map<String, String> mapIndexFields = new LinkedHashMap<>(); Map<String, String> mapStatsFields = new LinkedHashMap<>(); String[] pieces; String[] tuples; pieces = indexFields.split(","); for (String indexField : pieces) { mapIndexFields.put(indexField, indexField); } pieces = statsFields.split(","); for (String statField : pieces) { tuples = statField.split(":"); mapStatsFields.put(tuples[0], tuples.length > 1 ? 
tuples[1] : tuples[0]); } SimpleFSDirectory indexDirectory = new SimpleFSDirectory(indexFolder); Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_44); IndexWriterConfig indexWriterConfig = new IndexWriterConfig(Version.LUCENE_44, analyzer); IndexWriter indexWriter = new IndexWriter(indexDirectory, indexWriterConfig); SimpleFSDirectory taxonomyDirectory = new SimpleFSDirectory(taxonomyFolder); TaxonomyWriter taxonomyWriter = new DirectoryTaxonomyWriter(taxonomyDirectory, OpenMode.CREATE); FacetFields facetFields = new FacetFields(taxonomyWriter); List<CategoryPath> taxonomyCategories = new ArrayList<>(); String line; Integer rcdCount = 0; StringBuilder sb = new StringBuilder(); BufferedReader br = new BufferedReader(new FileReader(textFile)); while ((line = br.readLine()) != null) { rcdCount++; pieces = line.split(textColSeparator); if (rcdCount == 1) { int i = 0; for (String colHeader : pieces) { colHeaders.add(colHeader.trim()); colIndexes.put(colHeader, i); } } else { if (pieces.length == colHeaders.size()) { sb.setLength(0); Document document = new Document(); for (int i = 0; i < pieces.length; i++) { Field field = new TextField(colHeaders.get(i), pieces[i].trim(), Store.YES); document.add(field); if (mapIndexFields.containsKey(colHeaders.get(i))) { if (!pieces[i].trim().equals("")) { sb.append(pieces[i].trim()); sb.append(" "); } } } // append toxicity information to the document String toxCasEdfId = document.get(casEdfIdFieldName).trim(); Toxicity toxicity = new Toxicity(); if (toxicities.containsKey(toxCasEdfId)) { toxicity = toxicities.get(toxCasEdfId); document.add(new TextField("toxChemicalName", toxicity.getToxChemicalName().trim(), Store.YES)); sb.append(toxicity.getToxChemicalName().trim()); sb.append(" "); document.add(new TextField("toxRecognized", toxicity.getToxRecognized().trim(), Store.YES)); sb.append(toxicity.getToxRecognized().trim()); sb.append(" "); document.add(new TextField("toxSuspected", toxicity.getToxSuspected().trim(), 
Store.YES)); sb.append(toxicity.getToxSuspected().trim()); sb.append(" "); } else { document.add(new TextField("toxChemicalName", "", Store.YES)); document.add(new TextField("toxRecognized", "", Store.YES)); document.add(new TextField("toxSuspected", "", Store.YES)); } Field field = new TextField("text", sb.toString().trim(), Store.NO); document.add(field); String toxChemical = toxicity.getToxChemicalName().trim(); // categorize recognized toxicities String toxRecognized = toxicity.getToxRecognized().trim(); if (!toxRecognized.equals("")) { taxonomyCategories.add(new CategoryPath("toxRecognized", "CasEdfId", toxCasEdfId)); taxonomyCategories.add(new CategoryPath("toxRecognized", "Chemical", toxChemical.replace("/", "|"))); for (String value : toxRecognized.replace(" ", ",").split(",")) { if (!value.trim().equals("")) { taxonomyCategories .add(new CategoryPath("toxRecognized", "Toxicity", value)); } } } // categorize suspected toxicities String toxSuspected = toxicity.getToxSuspected().trim(); if (!toxSuspected.equals("")) { taxonomyCategories.add(new CategoryPath("toxSuspected", "CasEdfId", toxCasEdfId)); taxonomyCategories.add(new CategoryPath("toxSuspected", "Chemical", toxChemical.replace("/", "|"))); for (String value : toxSuspected.replace(" ", ",").split(",")) { if (!value.trim().equals("")) { taxonomyCategories.add(new CategoryPath("toxSuspected", "Toxicity", value)); } } } // build up "stats" taxonomy categories for (String statsKey : mapStatsFields.keySet()) { if (mapIndexFields.containsKey(statsKey)) { String fieldValue = mapIndexFields.get(statsKey); if (!statsKey.trim().equals("") && !fieldValue.trim().equals("")) { taxonomyCategories.add(new CategoryPath("Chemicals", statsKey, fieldValue)); } } } if (taxonomyCategories.size() > 0) { facetFields.addFields(document, taxonomyCategories); // System.out.println("Taxonomies added: " + // taxonomyCategories.size()); } indexWriter.addDocument(document); if (progressInterval > 0 && rcdCount % progressInterval 
== 0) { message = "Records indexed: " + rcdCount; if (outputToSystemOut) { System.out.println(message); } if (outputToMsgQueue) { progressMessageQueue.send(new MessageInput(message)); } } taxonomyCategories.clear(); } } } br.close(); message = "Records indexed: " + rcdCount; if (outputToSystemOut) { System.out.println(message); } if (outputToMsgQueue) { progressMessageQueue.send(new MessageInput(message)); } sb.setLength(0); sb.trimToSize(); indexWriter.commit(); indexWriter.forceMerge(1); indexWriter.close(); taxonomyWriter.commit(); taxonomyWriter.close(); analyzer.close(); indexDirectory.close(); taxonomyDirectory.close(); } else { message = "Lucene Index Folder: " + indexFolder + " or Lucene Taxonomy folder: " + taxonomyFolder + " does not exist!"; if (outputToSystemErr) { System.err.println(message); } if (outputToMsgQueue) { progressMessageQueue.send(new MessageInput(message)); } } message = "Ended Indexing Chemicals via Lucene!"; if (outputToSystemOut) { System.out.println(message); } if (outputToMsgQueue) { progressMessageQueue.send(new MessageInput(message)); } } }
From source file:com.khepry.frackhem.entities.Reports.java
License:Apache License
public void indexViaLucene(String textPath, String textColSeparator, Map<String, Toxicity> toxicities, String... parseFields) throws IOException { String message;//w w w . j a v a 2s. com message = "Start Indexing Reports via Lucene..."; if (outputToSystemOut) { System.out.println(message); } if (outputToMsgQueue) { progressMessageQueue.send(new MessageInput(message)); } File textFile = new File(textPath); if (textFile.exists()) { File indexFolder = new File(indexFolderPath); if (!indexFolder.exists()) { indexFolder.mkdir(); } File taxonomyFolder = new File(taxonomyFolderPath); if (!taxonomyFolder.exists()) { taxonomyFolder.mkdir(); } if (indexFolder.exists() && taxonomyFolder.exists()) { deleteFolder(indexFolder); if (!indexFolder.exists()) { indexFolder.mkdir(); } deleteFolder(taxonomyFolder); if (!taxonomyFolder.exists()) { taxonomyFolder.mkdir(); } Map<String, String> mapBreakFields = new LinkedHashMap<>(); Map<String, String> mapIndexFields = new LinkedHashMap<>(); Map<String, String> mapLevelFields = new LinkedHashMap<>(); Map<String, String> mapStatsFields = new LinkedHashMap<>(); Map<String, Integer> mapColIndexes = new LinkedHashMap<>(); String[] pieces; String[] tuples; pieces = indexFields.split(","); for (String indexField : pieces) { mapIndexFields.put(indexField, ""); } pieces = levelFields.split(","); for (String levelField : pieces) { mapBreakFields.put(levelField, ""); mapLevelFields.put(levelField, ""); } pieces = statsFields.split(","); for (String statField : pieces) { tuples = statField.split(":"); mapStatsFields.put(tuples[0], tuples.length > 1 ? 
tuples[1] : tuples[0]); } Map<String, Map<String, String>> mapToxValues = new LinkedHashMap<>(); for (String parseField : parseFields) { mapToxValues.put(parseField, new TreeMap<String, String>()); } SimpleFSDirectory indexDirectory = new SimpleFSDirectory(indexFolder); SimpleFSDirectory taxonomyDirectory = new SimpleFSDirectory(taxonomyFolder); Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_44); IndexWriterConfig indexWriterConfig = new IndexWriterConfig(Version.LUCENE_44, analyzer); IndexWriter indexWriter = new IndexWriter(indexDirectory, indexWriterConfig); TaxonomyWriter taxonomyWriter = new DirectoryTaxonomyWriter(taxonomyDirectory, OpenMode.CREATE); FacetFields facetFields = new FacetFields(taxonomyWriter); List<CategoryPath> taxonomyCategories = new ArrayList<>(); String line; StringBuilder sbIndex = new StringBuilder(); StringBuilder sbLevel = new StringBuilder(); Integer outCount = 0; Integer rcdCount = 0; Boolean firstDataRecordHandled = false; BufferedReader br = new BufferedReader(new FileReader(textFile)); while ((line = br.readLine()) != null) { rcdCount++; pieces = line.split(textColSeparator); if (rcdCount == 1) { int i = 0; for (String colHeader : pieces) { mapColIndexes.put(colHeader.trim(), i); i++; } } else { for (String key : mapLevelFields.keySet()) { if (mapColIndexes.containsKey(key)) { String value = pieces[mapColIndexes.get(key)].trim(); // build up level-break values if (mapLevelFields.containsKey(key)) { mapLevelFields.put(key, value); } } } if (!firstDataRecordHandled) { mapBreakFields.putAll(mapLevelFields); firstDataRecordHandled = true; } // if there is a "level break" if (!mapLevelFields.equals(mapBreakFields)) { Document tgtDocument = new Document(); for (Map.Entry<String, String> entry : mapBreakFields.entrySet()) { Field field = new TextField(entry.getKey(), entry.getValue(), Store.YES); tgtDocument.add(field); } for (Map.Entry<String, Map<String, String>> toxEntry : mapToxValues.entrySet()) { String fieldName = 
toxEntry.getKey(); String fieldValue = GenericUtilities.joinString(toxEntry.getValue().values(), " "); // System.out.println(fieldName + ": " + fieldValue); sbIndex.append(fieldValue); sbIndex.append(" "); tgtDocument.add(new TextField(fieldName, fieldValue, Store.YES)); // build up "Toxicity" taxonomy categories for (String value : fieldValue.replace(" ", ",").split(",")) { if (!value.trim().equals("")) { taxonomyCategories.add(new CategoryPath(fieldName, "Toxicity", value)); } } // build up "stats" taxonomy categories for (String statsKey : mapStatsFields.keySet()) { if (mapLevelFields.containsKey(statsKey)) { String levelValue = mapLevelFields.get(statsKey); if (!statsKey.trim().equals("") && !levelValue.trim().equals("")) { taxonomyCategories .add(new CategoryPath("Reports", statsKey, levelValue)); } } } } tgtDocument.add(new TextField("text", sbIndex.toString().trim(), Store.NO)); if (taxonomyCategories.size() > 0) { facetFields.addFields(tgtDocument, taxonomyCategories); // System.out.println("Taxonomies added: " + // taxonomyCategories.size()); } indexWriter.addDocument(tgtDocument); outCount++; sbIndex.setLength(0); for (String key : mapToxValues.keySet()) { mapToxValues.get(key).clear(); } taxonomyCategories.clear(); mapBreakFields.putAll(mapLevelFields); } // build up text index values for (String key : mapLevelFields.keySet()) { if (mapColIndexes.containsKey(key)) { String value = pieces[mapColIndexes.get(key)].trim(); if (!value.equals("")) { // build up 'text' field index value if (mapIndexFields.containsKey(key)) { sbIndex.append(value); sbIndex.append(" "); } } } } // build up toxicity values for later level-break use if (mapColIndexes.containsKey(casEdfIdFieldName)) { Toxicity toxicity = toxicities.get(pieces[mapColIndexes.get(casEdfIdFieldName)].trim()); if (toxicity != null) { // build up recognized toxicity values String[] toxRValues = toxicity.getToxRecognized().split(","); for (String toxValue : toxRValues) { if (!toxValue.equals("")) { if 
(!mapToxValues.get("toxRecognized").containsKey(toxValue)) { mapToxValues.get("toxRecognized").put(toxValue, toxValue); } } } // build up suspected toxicity values String[] toxSValues = toxicity.getToxSuspected().split(","); for (String toxValue : toxSValues) { if (!toxValue.equals("")) { if (!mapToxValues.get("toxSuspected").containsKey(toxValue)) { mapToxValues.get("toxSuspected").put(toxValue, toxValue); } } } } } if (progressInterval > 0 && rcdCount % progressInterval == 0) { message = "Records indexed: " + rcdCount; if (outputToSystemOut) { System.out.println(message); } if (outputToMsgQueue) { progressMessageQueue.send(new MessageInput(message)); } } } } br.close(); // handle end-of-file processing Document tgtDocument = new Document(); for (Map.Entry<String, String> entry : mapBreakFields.entrySet()) { Field field = new TextField(entry.getKey(), entry.getValue(), Store.YES); tgtDocument.add(field); } for (Map.Entry<String, Map<String, String>> toxEntry : mapToxValues.entrySet()) { String fieldName = toxEntry.getKey(); String fieldValue = GenericUtilities.joinString(toxEntry.getValue().values(), " "); // System.out.println(fieldName + ": " + fieldValue); sbIndex.append(fieldValue); sbIndex.append(" "); tgtDocument.add(new TextField(fieldName, fieldValue, Store.YES)); // build up "Toxicity" taxonomy categories for (String value : fieldValue.replace(" ", ",").split(",")) { if (!value.trim().equals("")) { taxonomyCategories.add(new CategoryPath(fieldName, "Toxicity", value)); } } // build up "stats" taxonomy categories for (String statsKey : mapStatsFields.keySet()) { if (mapLevelFields.containsKey(statsKey)) { String levelValue = mapLevelFields.get(statsKey); if (!statsKey.trim().equals("") && !levelValue.trim().equals("")) { taxonomyCategories.add(new CategoryPath("Reports", statsKey, levelValue)); } } } } tgtDocument.add(new TextField("text", sbIndex.toString().trim(), Store.NO)); if (taxonomyCategories.size() > 0) { facetFields.addFields(tgtDocument, 
taxonomyCategories); // System.out.println("Taxonomies added: " + // taxonomyCategories.size()); } indexWriter.addDocument(tgtDocument); outCount++; message = "Records processed: " + rcdCount; if (outputToSystemOut) { System.out.println(message); } if (outputToMsgQueue) { progressMessageQueue.send(new MessageInput(message)); } message = "Records indexed: " + outCount; if (outputToSystemOut) { System.out.println(message); } if (outputToMsgQueue) { progressMessageQueue.send(new MessageInput(message)); } sbIndex.setLength(0); sbIndex.trimToSize(); sbLevel.setLength(0); sbLevel.trimToSize(); mapToxValues.clear(); indexWriter.commit(); indexWriter.forceMerge(1); indexWriter.close(); analyzer.close(); indexDirectory.close(); taxonomyWriter.commit(); taxonomyWriter.close(); taxonomyDirectory.close(); } else { message = "Lucene Index Folder: " + indexFolder + " or Lucene Taxonomy folder: " + taxonomyFolder + " does not exist!"; if (outputToSystemErr) { System.err.println(message); } if (outputToMsgQueue) { progressMessageQueue.send(new MessageInput(message)); } } message = "Ended Indexing Reports via Lucene!"; if (outputToSystemOut) { System.out.println(message); } if (outputToMsgQueue) { progressMessageQueue.send(new MessageInput(message)); } } }
From source file:com.khepry.frackhem.entities.Toxicities.java
License:Apache License
public void indexViaLucene(String textFilePath, String textColSeparator) throws IOException { String message;//from ww w. j av a 2 s .c om message = "Start Indexing Toxicities via Lucene..."; if (outputToSystemOut) { System.out.println(message); } if (outputToMsgQueue) { progressMessageQueue.send(new MessageInput(message)); } File textFile = new File(textFilePath); if (textFile.exists()) { File indexFolder = new File(indexFolderPath); if (!indexFolder.exists()) { indexFolder.mkdir(); } else { deleteFolder(indexFolder); if (!indexFolder.exists()) { indexFolder.mkdir(); } } File taxonomyFolder = new File(taxonomyFolderPath); if (!taxonomyFolder.exists()) { taxonomyFolder.mkdir(); } else { deleteFolder(taxonomyFolder); if (!taxonomyFolder.exists()) { taxonomyFolder.mkdir(); } } if (indexFolder.exists() && taxonomyFolder.exists()) { List<String> colHeaders = new ArrayList<>(); Map<String, String> mapIndexFields = new LinkedHashMap<>(); Map<String, String> mapStatsFields = new LinkedHashMap<>(); String[] pieces; String[] tuples; pieces = indexFields.split(","); for (String indexField : pieces) { mapIndexFields.put(indexField, indexField); } pieces = statsFields.split(","); for (String statField : pieces) { tuples = statField.split(":"); mapStatsFields.put(tuples[0], tuples.length > 1 ? 
tuples[1] : tuples[0]); } SimpleFSDirectory indexDirectory = new SimpleFSDirectory(indexFolder); Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_44); IndexWriterConfig indexWriterConfig = new IndexWriterConfig(Version.LUCENE_44, analyzer); IndexWriter indexWriter = new IndexWriter(indexDirectory, indexWriterConfig); SimpleFSDirectory taxonomyDirectory = new SimpleFSDirectory(taxonomyFolder); TaxonomyWriter taxonomyWriter = new DirectoryTaxonomyWriter(taxonomyDirectory, OpenMode.CREATE); FacetFields facetFields = new FacetFields(taxonomyWriter); List<CategoryPath> taxonomyCategories = new ArrayList<>(); String line; Integer rcdCount = 0; StringBuilder sb = new StringBuilder(); BufferedReader br = new BufferedReader(new FileReader(textFile)); while ((line = br.readLine()) != null) { rcdCount++; pieces = line.split(textColSeparator); if (rcdCount == 1) { for (String colHeader : pieces) { colHeaders.add(colHeader.trim()); } } else { if (pieces.length == colHeaders.size()) { sb.setLength(0); Document document = new Document(); for (int i = 0; i < pieces.length; i++) { Field field = new TextField(colHeaders.get(i), pieces[i].trim(), Store.YES); document.add(field); if (mapIndexFields.containsKey(colHeaders.get(i))) { if (!pieces[i].trim().equals("")) { sb.append(pieces[i].trim()); sb.append(" "); } } } Field field = new TextField("text", sb.toString().trim(), Store.NO); document.add(field); String toxCasEdfId = pieces[0].trim(); String toxChemical = pieces[1].trim(); // categorize recognized toxicities String toxRecognized = pieces[2].trim(); if (!toxRecognized.equals("")) { taxonomyCategories.add(new CategoryPath("toxRecognized", "CasEdfId", toxCasEdfId)); taxonomyCategories.add(new CategoryPath("toxRecognized", "Chemical", toxChemical.replace("/", "|"))); for (String value : toxRecognized.replace(" ", ",").split(",")) { if (!value.trim().equals("")) { taxonomyCategories .add(new CategoryPath("toxRecognized", "Toxicity", value)); } } } // categorize suspected 
toxicities String toxSuspected = pieces[3].trim(); if (!toxSuspected.equals("")) { taxonomyCategories.add(new CategoryPath("toxSuspected", "CasEdfId", toxCasEdfId)); taxonomyCategories.add(new CategoryPath("toxSuspected", "Chemical", toxChemical.replace("/", "|"))); for (String value : toxSuspected.replace(" ", ",").split(",")) { if (!value.trim().equals("")) { taxonomyCategories.add(new CategoryPath("toxSuspected", "Toxicity", value)); } } } // build up "stats" taxonomy categories for (String statsKey : mapStatsFields.keySet()) { if (mapIndexFields.containsKey(statsKey)) { String fieldValue = mapIndexFields.get(statsKey); if (!statsKey.trim().equals("") && !fieldValue.trim().equals("")) { taxonomyCategories .add(new CategoryPath("Toxicities", statsKey, fieldValue)); } } } if (taxonomyCategories.size() > 0) { facetFields.addFields(document, taxonomyCategories); // System.out.println("Taxonomies added: " + // taxonomyCategories.size()); } indexWriter.addDocument(document); if (progressInterval > 0 && rcdCount % progressInterval == 0) { message = "Records indexed: " + rcdCount; if (outputToSystemOut) { System.out.println(message); } if (outputToMsgQueue) { progressMessageQueue.send(new MessageInput(message)); } } taxonomyCategories.clear(); } } } br.close(); message = "Records indexed: " + rcdCount; if (outputToSystemOut) { System.out.println(message); } if (outputToMsgQueue) { progressMessageQueue.send(new MessageInput(message)); } sb.setLength(0); sb.trimToSize(); indexWriter.commit(); indexWriter.forceMerge(1); indexWriter.close(); taxonomyWriter.commit(); taxonomyWriter.close(); analyzer.close(); indexDirectory.close(); taxonomyDirectory.close(); } else { message = "Lucene Index Folder: " + indexFolder + " or Lucene Taxonomy folder: " + taxonomyFolder + " does not exist!"; if (outputToSystemErr) { System.err.println(message); } } message = "Ended Indexing Toxicities via Lucene!"; if (outputToSystemOut) { System.out.println(message); } if (outputToMsgQueue) { 
progressMessageQueue.send(new MessageInput(message)); } } }
From source file:com.orientechnologies.lucene.engine.OLuceneFacetManager.java
License:Apache License
protected void buildFacetIndexIfNeeded() throws IOException { if (metadata != null && metadata.containsField(FACET_FIELDS)) { ODatabaseDocumentInternal database = owner.getDatabase(); Iterable<String> iterable = metadata.field(FACET_FIELDS); if (iterable != null) { Directory dir = getTaxDirectory(database); taxonomyWriter = new DirectoryTaxonomyWriter(dir, IndexWriterConfig.OpenMode.CREATE_OR_APPEND); for (String s : iterable) { facetField = s;/* w ww . j a va 2s. com*/ // facetField = "facet_" + s; // facetDim = s; // config.setIndexFieldName(s, "facet_" + s); config.setHierarchical(s, true); } } } }