List of usage examples for org.apache.lucene.facet.taxonomy.TaxonomyWriter#commit
public long commit() throws IOException;
From source file:com.chimpler.example.FacetLuceneIndexer.java
License:Apache License
public static void main(String args[]) throws Exception { // if (args.length != 3) { // System.err.println("Parameters: [index directory] [taxonomy directory] [json file]"); // System.exit(1); // }/* w ww. ja v a 2 s . c o m*/ String indexDirectory = "index"; String taxonomyDirectory = "taxonomy"; String jsonFileName = "/home/qiuqiang/workspace/facet-lucene-example/books.json"; IndexWriterConfig writerConfig = new IndexWriterConfig(LUCENE_VERSION, new WhitespaceAnalyzer(LUCENE_VERSION)); writerConfig.setOpenMode(OpenMode.APPEND); IndexWriter indexWriter = new IndexWriter(FSDirectory.open(new File(indexDirectory)), writerConfig); TaxonomyWriter taxonomyWriter = new DirectoryTaxonomyWriter(MMapDirectory.open(new File(taxonomyDirectory)), OpenMode.APPEND); TaxonomyReader taxonomyReader = new DirectoryTaxonomyReader(FSDirectory.open(new File(taxonomyDirectory))); String content = IOUtils.toString(new FileInputStream(jsonFileName)); JSONArray bookArray = new JSONArray(content); Field idField = new IntField("id", 0, Store.YES); Field titleField = new TextField("title", "", Store.YES); Field authorsField = new TextField("authors", "", Store.YES); Field bookCategoryField = new TextField("book_category", "", Store.YES); indexWriter.deleteAll(); FacetFields facetFields = new FacetFields(taxonomyWriter); for (int i = 0; i < bookArray.length(); i++) { Document document = new Document(); JSONObject book = bookArray.getJSONObject(i); int id = book.getInt("id"); String title = book.getString("title"); String bookCategory = book.getString("book_category"); List<CategoryPath> categoryPaths = new ArrayList<CategoryPath>(); String authorsString = ""; JSONArray authors = book.getJSONArray("authors"); for (int j = 0; j < authors.length(); j++) { String author = authors.getString(j); if (j > 0) { authorsString += ", "; } categoryPaths.add(new CategoryPath("author", author)); authorsString += author; } categoryPaths.add(new CategoryPath("book_category" + bookCategory, '/')); 
idField.setIntValue(id); titleField.setStringValue(title); authorsField.setStringValue(authorsString); bookCategoryField.setStringValue(bookCategory); facetFields.addFields(document, categoryPaths); document.add(idField); document.add(titleField); document.add(authorsField); document.add(bookCategoryField); indexWriter.addDocument(document); System.out.printf("Book: id=%d, title=%s, book_category=%s, authors=%s\n", id, title, bookCategory, authors); } taxonomyWriter.prepareCommit(); try { taxonomyWriter.commit(); } catch (Exception e) { taxonomyWriter.rollback(); } // taxonomyWriter.close(); // // indexWriter.commit(); // indexWriter.close(); String query = "story"; IndexReader indexReader = DirectoryReader.open(indexWriter, false); IndexReader indexReader2 = DirectoryReader.open(indexWriter, false); System.out.println(indexReader == indexReader2); IndexSearcher indexSearcher = new IndexSearcher(indexReader); TaxonomyReader newTaxonomyReader = DirectoryTaxonomyReader.openIfChanged(taxonomyReader); if (newTaxonomyReader != null) { TaxonomyReader tmp = taxonomyReader; taxonomyReader = newTaxonomyReader; tmp.close(); } else { System.out.println("null"); } ArrayList<FacetRequest> facetRequests = new ArrayList<FacetRequest>(); facetRequests.add(new CountFacetRequest(new CategoryPath("author"), 100)); facetRequests.add(new CountFacetRequest(new CategoryPath("book_category"), 100)); FacetSearchParams searchParams = new FacetSearchParams(facetRequests); ComplexPhraseQueryParser queryParser = new ComplexPhraseQueryParser(LUCENE_VERSION, "title", new StandardAnalyzer(LUCENE_VERSION)); Query luceneQuery = queryParser.parse(query); // Collectors to get top results and facets TopScoreDocCollector topScoreDocCollector = TopScoreDocCollector.create(10, true); FacetsCollector facetsCollector = FacetsCollector.create(searchParams, indexReader, taxonomyReader); indexSearcher.search(luceneQuery, MultiCollector.wrap(topScoreDocCollector, facetsCollector)); 
System.out.println("Found:"); for (ScoreDoc scoreDoc : topScoreDocCollector.topDocs().scoreDocs) { Document document = indexReader.document(scoreDoc.doc); System.out.printf("- book: id=%s, title=%s, book_category=%s, authors=%s, score=%f\n", document.get("id"), document.get("title"), document.get("book_category"), document.get("authors"), scoreDoc.score); } System.out.println("Facets:"); for (FacetResult facetResult : facetsCollector.getFacetResults()) { System.out.println("- " + facetResult.getFacetResultNode().label); for (FacetResultNode facetResultNode : facetResult.getFacetResultNode().subResults) { System.out.printf(" - %s (%f)\n", facetResultNode.label.toString(), facetResultNode.value); for (FacetResultNode subFacetResultNode : facetResultNode.subResults) { System.out.printf(" - %s (%f)\n", subFacetResultNode.label.toString(), subFacetResultNode.value); } } } taxonomyReader.close(); indexReader.close(); taxonomyWriter.commit(); taxonomyWriter.close(); indexWriter.commit(); indexWriter.close(); }
From source file:com.fuerve.villageelder.actions.results.SearchResultItemTest.java
License:Apache License
private void buildDummyIndex(final Directory indexDirectory, final Directory taxonomyDirectory) throws IOException { IndexWriterConfig iwc = new IndexWriterConfig(Lucene.LUCENE_VERSION, Lucene.getPerFieldAnalyzer()); iwc.setOpenMode(OpenMode.CREATE);//from w w w .j a va 2 s . co m IndexWriter iw = new IndexWriter(indexDirectory, iwc); TaxonomyWriter tw = new DirectoryTaxonomyWriter(taxonomyDirectory, OpenMode.CREATE); List<CategoryPath> categories = new ArrayList<CategoryPath>(); FacetFields facetFields = new FacetFields(tw); Document doc = new Document(); categories.clear(); doc.add(new StringField("Author", "foo", Store.YES)); categories.add(new CategoryPath("Author", "foo")); doc.add(new LongField("RevisionNumber", 50L, Store.YES)); doc.add(new StringField("Revision", "50", Store.YES)); doc.add(new TextField("Message", "stuff", Store.YES)); iw.addDocument(doc); facetFields.addFields(doc, categories); doc = new Document(); facetFields = new FacetFields(tw); categories.clear(); doc.add(new StringField("Author", "bar", Store.YES)); categories.add(new CategoryPath("Author", "bar")); doc.add(new LongField("RevisionNumber", 5000L, Store.YES)); doc.add(new StringField("Revision", "5000", Store.YES)); doc.add(new TextField("Message", "stuff", Store.YES)); iw.addDocument(doc); facetFields.addFields(doc, categories); tw.commit(); tw.close(); iw.commit(); iw.close(); }
From source file:com.fuerve.villageelder.search.SearcherTest.java
License:Apache License
/** * Test method for {@link com.fuerve.villageelder.search.Searcher#initializeSearch()}. *//* w w w. j a va 2 s . c om*/ @SuppressWarnings("unused") @Test public final void testInitializeSearch() throws Exception { // Gather declared fields. Field indexDirectoryField = Searcher.class.getDeclaredField("indexDirectory"); Field taxonomyDirectoryField = Searcher.class.getDeclaredField("taxonomyDirectory"); Field indexDirectoryNameField = Searcher.class.getDeclaredField("indexDirectoryName"); Field taxonomyDirectoryNameField = Searcher.class.getDeclaredField("taxonomyDirectoryName"); Field stringDirectoriesField = Searcher.class.getDeclaredField("stringDirectories"); Field initializedField = Searcher.class.getDeclaredField("initialized"); Field searchField = Searcher.class.getDeclaredField("search"); Field indexReaderField = Searcher.class.getDeclaredField("indexReader"); Field indexSearcherField = Searcher.class.getDeclaredField("indexSearcher"); Field taxonomyReaderField = Searcher.class.getDeclaredField("taxonomyReader"); indexDirectoryField.setAccessible(true); taxonomyDirectoryField.setAccessible(true); indexDirectoryNameField.setAccessible(true); taxonomyDirectoryNameField.setAccessible(true); stringDirectoriesField.setAccessible(true); initializedField.setAccessible(true); searchField.setAccessible(true); indexReaderField.setAccessible(true); indexSearcherField.setAccessible(true); taxonomyReaderField.setAccessible(true); // Setup Directory indexDirectoryExpected = new RAMDirectory(); Directory taxonomyDirectoryExpected = new RAMDirectory(); IndexWriterConfig iwc = new IndexWriterConfig(Lucene.LUCENE_VERSION, Lucene.getPerFieldAnalyzer()); IndexWriter iw = new IndexWriter(indexDirectoryExpected, iwc); TaxonomyWriter tw = new DirectoryTaxonomyWriter(taxonomyDirectoryExpected, OpenMode.CREATE); iw.commit(); tw.commit(); Searcher target = new Searcher(indexDirectoryExpected, taxonomyDirectoryExpected); target.initializeSearch(); // Gather field values. 
Directory indexDirectoryActual = (Directory) indexDirectoryField.get(target); Directory taxonomyDirectoryActual = (Directory) taxonomyDirectoryField.get(target); String indexDirectoryNameActual = (String) indexDirectoryNameField.get(target); String taxonomyDirectoryNameActual = (String) taxonomyDirectoryNameField.get(target); boolean stringDirectoriesActual = stringDirectoriesField.getBoolean(target); boolean initializedActual = initializedField.getBoolean(target); Search searchFieldActual = (Search) searchField.get(target); IndexReader indexReaderActual = (IndexReader) indexReaderField.get(target); IndexSearcher indexSearcherActual = (IndexSearcher) indexSearcherField.get(target); TaxonomyReader taxonomyReaderActual = (TaxonomyReader) taxonomyReaderField.get(target); // Test assertEquals(true, initializedActual); assertNotNull(indexReaderActual); assertNotNull(indexSearcherActual); assertNotNull(taxonomyReaderActual); // Finish tw.close(); iw.close(); }
From source file:com.fuerve.villageelder.search.SearchTest.java
License:Apache License
/** * Test method for {@link com.fuerve.villageelder.search.Search#getFacetsCollector(org.apache.lucene.index.DirectoryReader, org.apache.lucene.facet.taxonomy.TaxonomyReader)}. */// ww w. j av a 2 s . c o m @Test @SuppressWarnings({ "unchecked", "unused" }) public final void testGetFacetsCollector() throws Exception { // Constants Field defaultSortField = Search.class.getDeclaredField("DEFAULT_SORT"); Field defaultFacetsField = Search.class.getDeclaredField("DEFAULT_FACETS"); Field defaultFacetStringsField = Search.class.getDeclaredField("DEFAULT_FACET_STRINGS"); Field defaultAnalyzerField = Search.class.getDeclaredField("DEFAULT_ANALYZER"); Field defaultHitsField = Search.class.getDeclaredField("DEFAULT_HITS"); defaultSortField.setAccessible(true); defaultFacetsField.setAccessible(true); defaultFacetStringsField.setAccessible(true); defaultAnalyzerField.setAccessible(true); defaultHitsField.setAccessible(true); final Sort defaultSort = (Sort) defaultSortField.get(null); final List<FacetRequest> defaultFacets = (List<FacetRequest>) defaultFacetsField.get(null); final Map<String, Integer> defaultFacetStrings = (Map<String, Integer>) defaultFacetStringsField.get(null); final Analyzer defaultAnalyzer = (Analyzer) defaultAnalyzerField.get(null); final int defaultHits = defaultHitsField.getInt(null); // Private members Field queryField = Search.class.getDeclaredField("query"); Field sortField = Search.class.getDeclaredField("sort"); Field facetsField = Search.class.getDeclaredField("facets"); queryField.setAccessible(true); sortField.setAccessible(true); facetsField.setAccessible(true); // Test setup QueryParser parser = getQueryParser(); Query queryExpected = parser.parse("test:foo"); List<FacetRequest> facetsExpected = new ArrayList<FacetRequest>(); Sort sortExpected = Sort.RELEVANCE; Search target = new Search(queryExpected, facetsExpected, sortExpected); target.addFacet("test", 100); // Gather fields Query queryActual = (Query) queryField.get(target); Sort 
sortActual = (Sort) sortField.get(target); List<FacetRequest> facetsActual = (List<FacetRequest>) facetsField.get(target); // Set up some dummy indices. Directory indexDirectory = new RAMDirectory(); IndexWriterConfig iwc = new IndexWriterConfig(Lucene.LUCENE_VERSION, Lucene.getPerFieldAnalyzer()); IndexWriter iw = new IndexWriter(indexDirectory, iwc); Directory taxonomyDirectory = new RAMDirectory(); TaxonomyWriter tw = new DirectoryTaxonomyWriter(taxonomyDirectory, OpenMode.CREATE); iw.commit(); tw.commit(); // Test FacetsCollector actual = target.getFacetsCollector(DirectoryReader.open(indexDirectory), new DirectoryTaxonomyReader(taxonomyDirectory)); assertEquals("DocsOnlyCollector", actual.getClass().getSimpleName()); iw.close(); tw.close(); taxonomyDirectory.close(); }
From source file:com.khepry.frackhem.entities.Blendeds.java
License:Apache License
public void indexViaLucene(String textFilePath, String textColSeparator, String casEdfIdFieldName, Map<String, Toxicity> toxicities) throws IOException { String message;//from w w w . j av a 2 s . c o m message = "Start Indexing Blendeds via Lucene..."; if (outputToSystemOut) { System.out.println(message); } if (outputToMsgQueue) { progressMessageQueue.send(new MessageInput(message)); } File textFile = new File(textFilePath); if (textFile.exists()) { File indexFolder = new File(indexFolderPath); if (!indexFolder.exists()) { indexFolder.mkdir(); } else { deleteFolder(indexFolder); if (!indexFolder.exists()) { indexFolder.mkdir(); } } File taxonomyFolder = new File(taxonomyFolderPath); if (!taxonomyFolder.exists()) { taxonomyFolder.mkdir(); } else { deleteFolder(taxonomyFolder); if (!taxonomyFolder.exists()) { taxonomyFolder.mkdir(); } } if (indexFolder.exists() && taxonomyFolder.exists()) { List<String> colHeaders = new ArrayList<>(); Map<String, Integer> colIndexes = new LinkedHashMap<>(); Map<String, String> mapIndexFields = new LinkedHashMap<>(); Map<String, String> mapStatsFields = new LinkedHashMap<>(); String[] pieces; String[] tuples; pieces = indexFields.split(","); for (String indexField : pieces) { mapIndexFields.put(indexField, indexField); } pieces = statsFields.split(","); for (String statField : pieces) { tuples = statField.split(":"); mapStatsFields.put(tuples[0], tuples.length > 1 ? 
tuples[1] : tuples[0]); } SimpleFSDirectory indexDirectory = new SimpleFSDirectory(indexFolder); Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_44); IndexWriterConfig indexWriterConfig = new IndexWriterConfig(Version.LUCENE_44, analyzer); IndexWriter indexWriter = new IndexWriter(indexDirectory, indexWriterConfig); SimpleFSDirectory taxonomyDirectory = new SimpleFSDirectory(taxonomyFolder); TaxonomyWriter taxonomyWriter = new DirectoryTaxonomyWriter(taxonomyDirectory, OpenMode.CREATE); FacetFields facetFields = new FacetFields(taxonomyWriter); List<CategoryPath> taxonomyCategories = new ArrayList<>(); String line; Integer rcdCount = 0; StringBuilder sb = new StringBuilder(); BufferedReader br = new BufferedReader(new FileReader(textFile)); while ((line = br.readLine()) != null) { rcdCount++; pieces = line.split(textColSeparator); if (rcdCount == 1) { int i = 0; for (String colHeader : pieces) { colHeaders.add(colHeader.trim()); colIndexes.put(colHeader, i); } } else { if (pieces.length == colHeaders.size()) { sb.setLength(0); Document document = new Document(); for (int i = 0; i < pieces.length; i++) { Field field = new TextField(colHeaders.get(i), pieces[i].trim(), Store.YES); document.add(field); if (mapIndexFields.containsKey(colHeaders.get(i))) { if (!pieces[i].trim().equals("")) { sb.append(pieces[i].trim()); sb.append(" "); } } } // append toxicity information to the document String toxCasEdfId = document.get(casEdfIdFieldName).trim(); Toxicity toxicity = new Toxicity(); if (toxicities.containsKey(toxCasEdfId)) { toxicity = toxicities.get(toxCasEdfId); document.add(new TextField("toxChemicalName", toxicity.getToxChemicalName().trim(), Store.YES)); sb.append(toxicity.getToxChemicalName().trim()); sb.append(" "); document.add(new TextField("toxRecognized", toxicity.getToxRecognized().trim(), Store.YES)); sb.append(toxicity.getToxRecognized().trim()); sb.append(" "); document.add(new TextField("toxSuspected", toxicity.getToxSuspected().trim(), 
Store.YES)); sb.append(toxicity.getToxSuspected().trim()); sb.append(" "); } else { document.add(new TextField("toxChemicalName", "", Store.YES)); document.add(new TextField("toxRecognized", "", Store.YES)); document.add(new TextField("toxSuspected", "", Store.YES)); } Field field = new TextField("text", sb.toString().trim(), Store.NO); document.add(field); String toxChemical = toxicity.getToxChemicalName().trim(); // categorize recognized toxicities String toxRecognized = toxicity.getToxRecognized().trim(); if (!toxRecognized.equals("")) { taxonomyCategories.add(new CategoryPath("toxRecognized", "CasEdfId", toxCasEdfId)); taxonomyCategories.add(new CategoryPath("toxRecognized", "Chemical", toxChemical.replace("/", "|"))); for (String value : toxRecognized.replace(" ", ",").split(",")) { if (!value.trim().equals("")) { taxonomyCategories .add(new CategoryPath("toxRecognized", "Toxicity", value)); } } } // categorize suspected toxicities String toxSuspected = toxicity.getToxSuspected().trim(); if (!toxSuspected.equals("")) { taxonomyCategories.add(new CategoryPath("toxSuspected", "CasEdfId", toxCasEdfId)); taxonomyCategories.add(new CategoryPath("toxSuspected", "Chemical", toxChemical.replace("/", "|"))); for (String value : toxSuspected.replace(" ", ",").split(",")) { if (!value.trim().equals("")) { taxonomyCategories.add(new CategoryPath("toxSuspected", "Toxicity", value)); } } } // build up "stats" taxonomy categories for (String statsKey : mapStatsFields.keySet()) { if (mapIndexFields.containsKey(statsKey)) { String fieldValue = mapIndexFields.get(statsKey); if (!statsKey.trim().equals("") && !fieldValue.trim().equals("")) { taxonomyCategories.add(new CategoryPath("Blendeds", statsKey, fieldValue)); } } } if (taxonomyCategories.size() > 0) { facetFields.addFields(document, taxonomyCategories); // System.out.println("Taxonomies added: " + // taxonomyCategories.size()); } indexWriter.addDocument(document); if (progressInterval > 0 && rcdCount % progressInterval == 
0) { message = "Records indexed: " + rcdCount; if (outputToSystemOut) { System.out.println(message); } if (outputToMsgQueue) { progressMessageQueue.send(new MessageInput(message)); } } taxonomyCategories.clear(); } } } br.close(); message = "Records indexed: " + rcdCount; if (outputToSystemOut) { System.out.println(message); } if (outputToMsgQueue) { progressMessageQueue.send(new MessageInput(message)); } sb.setLength(0); sb.trimToSize(); indexWriter.commit(); indexWriter.forceMerge(1); indexWriter.close(); taxonomyWriter.commit(); taxonomyWriter.close(); analyzer.close(); indexDirectory.close(); taxonomyDirectory.close(); } else { message = "Lucene Index Folder: " + indexFolder + " or Lucene Taxonomy folder: " + taxonomyFolder + " does not exist!"; if (outputToSystemErr) { System.err.println(message); } if (outputToMsgQueue) { progressMessageQueue.send(new MessageInput(message)); } } message = "Ended Indexing Blendeds via Lucene!"; if (outputToSystemOut) { System.out.println(message); } if (outputToMsgQueue) { progressMessageQueue.send(new MessageInput(message)); } } }
From source file:com.khepry.frackhem.entities.Chemicals.java
License:Apache License
public void indexViaLucene(String textFilePath, String textColSeparator, String casEdfIdFieldName, Map<String, Toxicity> toxicities) throws IOException { String message;//from w w w . ja v a2s .c o m message = "Start Indexing Chemicals via Lucene..."; if (outputToSystemOut) { System.out.println(message); } if (outputToMsgQueue) { progressMessageQueue.send(new MessageInput(message)); } File textFile = new File(textFilePath); if (textFile.exists()) { File indexFolder = new File(indexFolderPath); if (!indexFolder.exists()) { indexFolder.mkdir(); } else { deleteFolder(indexFolder); if (!indexFolder.exists()) { indexFolder.mkdir(); } } File taxonomyFolder = new File(taxonomyFolderPath); if (!taxonomyFolder.exists()) { taxonomyFolder.mkdir(); } else { deleteFolder(taxonomyFolder); if (!taxonomyFolder.exists()) { taxonomyFolder.mkdir(); } } if (indexFolder.exists() && taxonomyFolder.exists()) { List<String> colHeaders = new ArrayList<>(); Map<String, Integer> colIndexes = new LinkedHashMap<>(); Map<String, String> mapIndexFields = new LinkedHashMap<>(); Map<String, String> mapStatsFields = new LinkedHashMap<>(); String[] pieces; String[] tuples; pieces = indexFields.split(","); for (String indexField : pieces) { mapIndexFields.put(indexField, indexField); } pieces = statsFields.split(","); for (String statField : pieces) { tuples = statField.split(":"); mapStatsFields.put(tuples[0], tuples.length > 1 ? 
tuples[1] : tuples[0]); } SimpleFSDirectory indexDirectory = new SimpleFSDirectory(indexFolder); Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_44); IndexWriterConfig indexWriterConfig = new IndexWriterConfig(Version.LUCENE_44, analyzer); IndexWriter indexWriter = new IndexWriter(indexDirectory, indexWriterConfig); SimpleFSDirectory taxonomyDirectory = new SimpleFSDirectory(taxonomyFolder); TaxonomyWriter taxonomyWriter = new DirectoryTaxonomyWriter(taxonomyDirectory, OpenMode.CREATE); FacetFields facetFields = new FacetFields(taxonomyWriter); List<CategoryPath> taxonomyCategories = new ArrayList<>(); String line; Integer rcdCount = 0; StringBuilder sb = new StringBuilder(); BufferedReader br = new BufferedReader(new FileReader(textFile)); while ((line = br.readLine()) != null) { rcdCount++; pieces = line.split(textColSeparator); if (rcdCount == 1) { int i = 0; for (String colHeader : pieces) { colHeaders.add(colHeader.trim()); colIndexes.put(colHeader, i); } } else { if (pieces.length == colHeaders.size()) { sb.setLength(0); Document document = new Document(); for (int i = 0; i < pieces.length; i++) { Field field = new TextField(colHeaders.get(i), pieces[i].trim(), Store.YES); document.add(field); if (mapIndexFields.containsKey(colHeaders.get(i))) { if (!pieces[i].trim().equals("")) { sb.append(pieces[i].trim()); sb.append(" "); } } } // append toxicity information to the document String toxCasEdfId = document.get(casEdfIdFieldName).trim(); Toxicity toxicity = new Toxicity(); if (toxicities.containsKey(toxCasEdfId)) { toxicity = toxicities.get(toxCasEdfId); document.add(new TextField("toxChemicalName", toxicity.getToxChemicalName().trim(), Store.YES)); sb.append(toxicity.getToxChemicalName().trim()); sb.append(" "); document.add(new TextField("toxRecognized", toxicity.getToxRecognized().trim(), Store.YES)); sb.append(toxicity.getToxRecognized().trim()); sb.append(" "); document.add(new TextField("toxSuspected", toxicity.getToxSuspected().trim(), 
Store.YES)); sb.append(toxicity.getToxSuspected().trim()); sb.append(" "); } else { document.add(new TextField("toxChemicalName", "", Store.YES)); document.add(new TextField("toxRecognized", "", Store.YES)); document.add(new TextField("toxSuspected", "", Store.YES)); } Field field = new TextField("text", sb.toString().trim(), Store.NO); document.add(field); String toxChemical = toxicity.getToxChemicalName().trim(); // categorize recognized toxicities String toxRecognized = toxicity.getToxRecognized().trim(); if (!toxRecognized.equals("")) { taxonomyCategories.add(new CategoryPath("toxRecognized", "CasEdfId", toxCasEdfId)); taxonomyCategories.add(new CategoryPath("toxRecognized", "Chemical", toxChemical.replace("/", "|"))); for (String value : toxRecognized.replace(" ", ",").split(",")) { if (!value.trim().equals("")) { taxonomyCategories .add(new CategoryPath("toxRecognized", "Toxicity", value)); } } } // categorize suspected toxicities String toxSuspected = toxicity.getToxSuspected().trim(); if (!toxSuspected.equals("")) { taxonomyCategories.add(new CategoryPath("toxSuspected", "CasEdfId", toxCasEdfId)); taxonomyCategories.add(new CategoryPath("toxSuspected", "Chemical", toxChemical.replace("/", "|"))); for (String value : toxSuspected.replace(" ", ",").split(",")) { if (!value.trim().equals("")) { taxonomyCategories.add(new CategoryPath("toxSuspected", "Toxicity", value)); } } } // build up "stats" taxonomy categories for (String statsKey : mapStatsFields.keySet()) { if (mapIndexFields.containsKey(statsKey)) { String fieldValue = mapIndexFields.get(statsKey); if (!statsKey.trim().equals("") && !fieldValue.trim().equals("")) { taxonomyCategories.add(new CategoryPath("Chemicals", statsKey, fieldValue)); } } } if (taxonomyCategories.size() > 0) { facetFields.addFields(document, taxonomyCategories); // System.out.println("Taxonomies added: " + // taxonomyCategories.size()); } indexWriter.addDocument(document); if (progressInterval > 0 && rcdCount % progressInterval 
== 0) { message = "Records indexed: " + rcdCount; if (outputToSystemOut) { System.out.println(message); } if (outputToMsgQueue) { progressMessageQueue.send(new MessageInput(message)); } } taxonomyCategories.clear(); } } } br.close(); message = "Records indexed: " + rcdCount; if (outputToSystemOut) { System.out.println(message); } if (outputToMsgQueue) { progressMessageQueue.send(new MessageInput(message)); } sb.setLength(0); sb.trimToSize(); indexWriter.commit(); indexWriter.forceMerge(1); indexWriter.close(); taxonomyWriter.commit(); taxonomyWriter.close(); analyzer.close(); indexDirectory.close(); taxonomyDirectory.close(); } else { message = "Lucene Index Folder: " + indexFolder + " or Lucene Taxonomy folder: " + taxonomyFolder + " does not exist!"; if (outputToSystemErr) { System.err.println(message); } if (outputToMsgQueue) { progressMessageQueue.send(new MessageInput(message)); } } message = "Ended Indexing Chemicals via Lucene!"; if (outputToSystemOut) { System.out.println(message); } if (outputToMsgQueue) { progressMessageQueue.send(new MessageInput(message)); } } }
From source file:com.khepry.frackhem.entities.Reports.java
License:Apache License
public void indexViaLucene(String textPath, String textColSeparator, Map<String, Toxicity> toxicities, String... parseFields) throws IOException { String message;//w w w . jav a 2s .c o m message = "Start Indexing Reports via Lucene..."; if (outputToSystemOut) { System.out.println(message); } if (outputToMsgQueue) { progressMessageQueue.send(new MessageInput(message)); } File textFile = new File(textPath); if (textFile.exists()) { File indexFolder = new File(indexFolderPath); if (!indexFolder.exists()) { indexFolder.mkdir(); } File taxonomyFolder = new File(taxonomyFolderPath); if (!taxonomyFolder.exists()) { taxonomyFolder.mkdir(); } if (indexFolder.exists() && taxonomyFolder.exists()) { deleteFolder(indexFolder); if (!indexFolder.exists()) { indexFolder.mkdir(); } deleteFolder(taxonomyFolder); if (!taxonomyFolder.exists()) { taxonomyFolder.mkdir(); } Map<String, String> mapBreakFields = new LinkedHashMap<>(); Map<String, String> mapIndexFields = new LinkedHashMap<>(); Map<String, String> mapLevelFields = new LinkedHashMap<>(); Map<String, String> mapStatsFields = new LinkedHashMap<>(); Map<String, Integer> mapColIndexes = new LinkedHashMap<>(); String[] pieces; String[] tuples; pieces = indexFields.split(","); for (String indexField : pieces) { mapIndexFields.put(indexField, ""); } pieces = levelFields.split(","); for (String levelField : pieces) { mapBreakFields.put(levelField, ""); mapLevelFields.put(levelField, ""); } pieces = statsFields.split(","); for (String statField : pieces) { tuples = statField.split(":"); mapStatsFields.put(tuples[0], tuples.length > 1 ? 
tuples[1] : tuples[0]); } Map<String, Map<String, String>> mapToxValues = new LinkedHashMap<>(); for (String parseField : parseFields) { mapToxValues.put(parseField, new TreeMap<String, String>()); } SimpleFSDirectory indexDirectory = new SimpleFSDirectory(indexFolder); SimpleFSDirectory taxonomyDirectory = new SimpleFSDirectory(taxonomyFolder); Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_44); IndexWriterConfig indexWriterConfig = new IndexWriterConfig(Version.LUCENE_44, analyzer); IndexWriter indexWriter = new IndexWriter(indexDirectory, indexWriterConfig); TaxonomyWriter taxonomyWriter = new DirectoryTaxonomyWriter(taxonomyDirectory, OpenMode.CREATE); FacetFields facetFields = new FacetFields(taxonomyWriter); List<CategoryPath> taxonomyCategories = new ArrayList<>(); String line; StringBuilder sbIndex = new StringBuilder(); StringBuilder sbLevel = new StringBuilder(); Integer outCount = 0; Integer rcdCount = 0; Boolean firstDataRecordHandled = false; BufferedReader br = new BufferedReader(new FileReader(textFile)); while ((line = br.readLine()) != null) { rcdCount++; pieces = line.split(textColSeparator); if (rcdCount == 1) { int i = 0; for (String colHeader : pieces) { mapColIndexes.put(colHeader.trim(), i); i++; } } else { for (String key : mapLevelFields.keySet()) { if (mapColIndexes.containsKey(key)) { String value = pieces[mapColIndexes.get(key)].trim(); // build up level-break values if (mapLevelFields.containsKey(key)) { mapLevelFields.put(key, value); } } } if (!firstDataRecordHandled) { mapBreakFields.putAll(mapLevelFields); firstDataRecordHandled = true; } // if there is a "level break" if (!mapLevelFields.equals(mapBreakFields)) { Document tgtDocument = new Document(); for (Map.Entry<String, String> entry : mapBreakFields.entrySet()) { Field field = new TextField(entry.getKey(), entry.getValue(), Store.YES); tgtDocument.add(field); } for (Map.Entry<String, Map<String, String>> toxEntry : mapToxValues.entrySet()) { String fieldName = 
toxEntry.getKey(); String fieldValue = GenericUtilities.joinString(toxEntry.getValue().values(), " "); // System.out.println(fieldName + ": " + fieldValue); sbIndex.append(fieldValue); sbIndex.append(" "); tgtDocument.add(new TextField(fieldName, fieldValue, Store.YES)); // build up "Toxicity" taxonomy categories for (String value : fieldValue.replace(" ", ",").split(",")) { if (!value.trim().equals("")) { taxonomyCategories.add(new CategoryPath(fieldName, "Toxicity", value)); } } // build up "stats" taxonomy categories for (String statsKey : mapStatsFields.keySet()) { if (mapLevelFields.containsKey(statsKey)) { String levelValue = mapLevelFields.get(statsKey); if (!statsKey.trim().equals("") && !levelValue.trim().equals("")) { taxonomyCategories .add(new CategoryPath("Reports", statsKey, levelValue)); } } } } tgtDocument.add(new TextField("text", sbIndex.toString().trim(), Store.NO)); if (taxonomyCategories.size() > 0) { facetFields.addFields(tgtDocument, taxonomyCategories); // System.out.println("Taxonomies added: " + // taxonomyCategories.size()); } indexWriter.addDocument(tgtDocument); outCount++; sbIndex.setLength(0); for (String key : mapToxValues.keySet()) { mapToxValues.get(key).clear(); } taxonomyCategories.clear(); mapBreakFields.putAll(mapLevelFields); } // build up text index values for (String key : mapLevelFields.keySet()) { if (mapColIndexes.containsKey(key)) { String value = pieces[mapColIndexes.get(key)].trim(); if (!value.equals("")) { // build up 'text' field index value if (mapIndexFields.containsKey(key)) { sbIndex.append(value); sbIndex.append(" "); } } } } // build up toxicity values for later level-break use if (mapColIndexes.containsKey(casEdfIdFieldName)) { Toxicity toxicity = toxicities.get(pieces[mapColIndexes.get(casEdfIdFieldName)].trim()); if (toxicity != null) { // build up recognized toxicity values String[] toxRValues = toxicity.getToxRecognized().split(","); for (String toxValue : toxRValues) { if (!toxValue.equals("")) { if 
(!mapToxValues.get("toxRecognized").containsKey(toxValue)) { mapToxValues.get("toxRecognized").put(toxValue, toxValue); } } } // build up suspected toxicity values String[] toxSValues = toxicity.getToxSuspected().split(","); for (String toxValue : toxSValues) { if (!toxValue.equals("")) { if (!mapToxValues.get("toxSuspected").containsKey(toxValue)) { mapToxValues.get("toxSuspected").put(toxValue, toxValue); } } } } } if (progressInterval > 0 && rcdCount % progressInterval == 0) { message = "Records indexed: " + rcdCount; if (outputToSystemOut) { System.out.println(message); } if (outputToMsgQueue) { progressMessageQueue.send(new MessageInput(message)); } } } } br.close(); // handle end-of-file processing Document tgtDocument = new Document(); for (Map.Entry<String, String> entry : mapBreakFields.entrySet()) { Field field = new TextField(entry.getKey(), entry.getValue(), Store.YES); tgtDocument.add(field); } for (Map.Entry<String, Map<String, String>> toxEntry : mapToxValues.entrySet()) { String fieldName = toxEntry.getKey(); String fieldValue = GenericUtilities.joinString(toxEntry.getValue().values(), " "); // System.out.println(fieldName + ": " + fieldValue); sbIndex.append(fieldValue); sbIndex.append(" "); tgtDocument.add(new TextField(fieldName, fieldValue, Store.YES)); // build up "Toxicity" taxonomy categories for (String value : fieldValue.replace(" ", ",").split(",")) { if (!value.trim().equals("")) { taxonomyCategories.add(new CategoryPath(fieldName, "Toxicity", value)); } } // build up "stats" taxonomy categories for (String statsKey : mapStatsFields.keySet()) { if (mapLevelFields.containsKey(statsKey)) { String levelValue = mapLevelFields.get(statsKey); if (!statsKey.trim().equals("") && !levelValue.trim().equals("")) { taxonomyCategories.add(new CategoryPath("Reports", statsKey, levelValue)); } } } } tgtDocument.add(new TextField("text", sbIndex.toString().trim(), Store.NO)); if (taxonomyCategories.size() > 0) { facetFields.addFields(tgtDocument, 
taxonomyCategories); // System.out.println("Taxonomies added: " + // taxonomyCategories.size()); } indexWriter.addDocument(tgtDocument); outCount++; message = "Records processed: " + rcdCount; if (outputToSystemOut) { System.out.println(message); } if (outputToMsgQueue) { progressMessageQueue.send(new MessageInput(message)); } message = "Records indexed: " + outCount; if (outputToSystemOut) { System.out.println(message); } if (outputToMsgQueue) { progressMessageQueue.send(new MessageInput(message)); } sbIndex.setLength(0); sbIndex.trimToSize(); sbLevel.setLength(0); sbLevel.trimToSize(); mapToxValues.clear(); indexWriter.commit(); indexWriter.forceMerge(1); indexWriter.close(); analyzer.close(); indexDirectory.close(); taxonomyWriter.commit(); taxonomyWriter.close(); taxonomyDirectory.close(); } else { message = "Lucene Index Folder: " + indexFolder + " or Lucene Taxonomy folder: " + taxonomyFolder + " does not exist!"; if (outputToSystemErr) { System.err.println(message); } if (outputToMsgQueue) { progressMessageQueue.send(new MessageInput(message)); } } message = "Ended Indexing Reports via Lucene!"; if (outputToSystemOut) { System.out.println(message); } if (outputToMsgQueue) { progressMessageQueue.send(new MessageInput(message)); } } }
From source file:com.khepry.frackhem.entities.Toxicities.java
License:Apache License
/**
 * Indexes the toxicities text file into a Lucene search index and a parallel
 * taxonomy (facet) index, both rebuilt from scratch on every call. Progress
 * and status messages go to System.out and/or the progress message queue,
 * depending on the configured output flags.
 *
 * @param textFilePath     path to the delimited input file; row 1 must hold column headers
 * @param textColSeparator column-separator regex passed to {@code String.split}
 * @throws IOException if reading the input file or writing either index fails
 */
public void indexViaLucene(String textFilePath, String textColSeparator) throws IOException {
    String message;
    message = "Start Indexing Toxicities via Lucene...";
    if (outputToSystemOut) {
        System.out.println(message);
    }
    if (outputToMsgQueue) {
        progressMessageQueue.send(new MessageInput(message));
    }
    File textFile = new File(textFilePath);
    if (textFile.exists()) {
        // Recreate the index folder from scratch: delete contents, then re-mkdir.
        File indexFolder = new File(indexFolderPath);
        if (!indexFolder.exists()) {
            indexFolder.mkdir();
        } else {
            deleteFolder(indexFolder);
            if (!indexFolder.exists()) {
                indexFolder.mkdir();
            }
        }
        // Recreate the taxonomy folder the same way.
        File taxonomyFolder = new File(taxonomyFolderPath);
        if (!taxonomyFolder.exists()) {
            taxonomyFolder.mkdir();
        } else {
            deleteFolder(taxonomyFolder);
            if (!taxonomyFolder.exists()) {
                taxonomyFolder.mkdir();
            }
        }
        if (indexFolder.exists() && taxonomyFolder.exists()) {
            List<String> colHeaders = new ArrayList<>();
            // indexFields: comma-separated column names whose values feed the
            // catch-all "text" field. Maps name -> name (a membership set).
            Map<String, String> mapIndexFields = new LinkedHashMap<>();
            // statsFields: comma-separated "key[:alias]" pairs; maps key -> alias.
            Map<String, String> mapStatsFields = new LinkedHashMap<>();
            String[] pieces;
            String[] tuples;
            pieces = indexFields.split(",");
            for (String indexField : pieces) {
                mapIndexFields.put(indexField, indexField);
            }
            pieces = statsFields.split(",");
            for (String statField : pieces) {
                tuples = statField.split(":");
                mapStatsFields.put(tuples[0], tuples.length > 1 ? tuples[1] : tuples[0]);
            }
            SimpleFSDirectory indexDirectory = new SimpleFSDirectory(indexFolder);
            Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_44);
            IndexWriterConfig indexWriterConfig = new IndexWriterConfig(Version.LUCENE_44, analyzer);
            IndexWriter indexWriter = new IndexWriter(indexDirectory, indexWriterConfig);
            SimpleFSDirectory taxonomyDirectory = new SimpleFSDirectory(taxonomyFolder);
            TaxonomyWriter taxonomyWriter = new DirectoryTaxonomyWriter(taxonomyDirectory, OpenMode.CREATE);
            FacetFields facetFields = new FacetFields(taxonomyWriter);
            List<CategoryPath> taxonomyCategories = new ArrayList<>();
            String line;
            Integer rcdCount = 0;
            StringBuilder sb = new StringBuilder();
            BufferedReader br = new BufferedReader(new FileReader(textFile));
            while ((line = br.readLine()) != null) {
                rcdCount++;
                pieces = line.split(textColSeparator);
                if (rcdCount == 1) {
                    // First row supplies the column headers.
                    for (String colHeader : pieces) {
                        colHeaders.add(colHeader.trim());
                    }
                } else {
                    // Rows whose column count differs from the header are skipped silently.
                    if (pieces.length == colHeaders.size()) {
                        sb.setLength(0);
                        Document document = new Document();
                        for (int i = 0; i < pieces.length; i++) {
                            Field field = new TextField(colHeaders.get(i), pieces[i].trim(), Store.YES);
                            document.add(field);
                            if (mapIndexFields.containsKey(colHeaders.get(i))) {
                                if (!pieces[i].trim().equals("")) {
                                    sb.append(pieces[i].trim());
                                    sb.append(" ");
                                }
                            }
                        }
                        // Catch-all searchable field built from the configured index columns.
                        Field field = new TextField("text", sb.toString().trim(), Store.NO);
                        document.add(field);
                        // Positional layout assumed from usage: 0=CasEdfId, 1=Chemical,
                        // 2=recognized toxicities, 3=suspected toxicities -- TODO confirm
                        // against the actual input file.
                        String toxCasEdfId = pieces[0].trim();
                        String toxChemical = pieces[1].trim();
                        // categorize recognized toxicities
                        String toxRecognized = pieces[2].trim();
                        if (!toxRecognized.equals("")) {
                            taxonomyCategories.add(new CategoryPath("toxRecognized", "CasEdfId", toxCasEdfId));
                            // '/' is the CategoryPath component delimiter, so escape it here.
                            taxonomyCategories.add(new CategoryPath("toxRecognized", "Chemical",
                                    toxChemical.replace("/", "|")));
                            for (String value : toxRecognized.replace(" ", ",").split(",")) {
                                if (!value.trim().equals("")) {
                                    taxonomyCategories
                                            .add(new CategoryPath("toxRecognized", "Toxicity", value));
                                }
                            }
                        }
                        // categorize suspected toxicities
                        String toxSuspected = pieces[3].trim();
                        if (!toxSuspected.equals("")) {
                            taxonomyCategories.add(new CategoryPath("toxSuspected", "CasEdfId", toxCasEdfId));
                            taxonomyCategories.add(new CategoryPath("toxSuspected", "Chemical",
                                    toxChemical.replace("/", "|")));
                            for (String value : toxSuspected.replace(" ", ",").split(",")) {
                                if (!value.trim().equals("")) {
                                    taxonomyCategories.add(new CategoryPath("toxSuspected", "Toxicity", value));
                                }
                            }
                        }
                        // build up "stats" taxonomy categories
                        // NOTE(review): mapIndexFields maps each name to itself, so
                        // fieldValue always equals statsKey here; this may have been
                        // meant to look up the record's column value instead -- verify.
                        for (String statsKey : mapStatsFields.keySet()) {
                            if (mapIndexFields.containsKey(statsKey)) {
                                String fieldValue = mapIndexFields.get(statsKey);
                                if (!statsKey.trim().equals("") && !fieldValue.trim().equals("")) {
                                    taxonomyCategories
                                            .add(new CategoryPath("Toxicities", statsKey, fieldValue));
                                }
                            }
                        }
                        if (taxonomyCategories.size() > 0) {
                            facetFields.addFields(document, taxonomyCategories);
                        }
                        indexWriter.addDocument(document);
                        if (progressInterval > 0 && rcdCount % progressInterval == 0) {
                            message = "Records indexed: " + rcdCount;
                            if (outputToSystemOut) {
                                System.out.println(message);
                            }
                            if (outputToMsgQueue) {
                                progressMessageQueue.send(new MessageInput(message));
                            }
                        }
                        // Categories are per-document; reset for the next record.
                        taxonomyCategories.clear();
                    }
                }
            }
            br.close();
            message = "Records indexed: " + rcdCount;
            if (outputToSystemOut) {
                System.out.println(message);
            }
            if (outputToMsgQueue) {
                progressMessageQueue.send(new MessageInput(message));
            }
            sb.setLength(0);
            sb.trimToSize();
            // Flush, merge down to a single segment, then release all resources.
            indexWriter.commit();
            indexWriter.forceMerge(1);
            indexWriter.close();
            taxonomyWriter.commit();
            taxonomyWriter.close();
            analyzer.close();
            indexDirectory.close();
            taxonomyDirectory.close();
        } else {
            message = "Lucene Index Folder: " + indexFolder + " or Lucene Taxonomy folder: " + taxonomyFolder
                    + " does not exist!";
            if (outputToSystemErr) {
                System.err.println(message);
            }
        }
        message = "Ended Indexing Toxicities via Lucene!";
        if (outputToSystemOut) {
            System.out.println(message);
        }
        if (outputToMsgQueue) {
            progressMessageQueue.send(new MessageInput(message));
        }
    }
}
From source file:edu.harvard.iq.dvn.core.index.Indexer.java
License:Apache License
/**
 * Indexes the released version of the given study: builds the main search
 * document (with facet/taxonomy categories), then three auxiliary indexes --
 * per-variable documents for tabular data files, file-metadata documents for
 * special files, and version-UNF documents for archived versions. Does
 * nothing when the study has no released version.
 *
 * @param study the study to index
 * @throws IOException if any of the Lucene writers fail
 */
protected void addDocument(Study study) throws IOException {
    StudyVersion sv = null;
    if (study.getReleasedVersion() != null) {
        sv = study.getReleasedVersion();
        Metadata metadata = sv.getMetadata();
        Document doc = new Document();
        logger.fine("Start indexing study " + study.getStudyId());
        // First argument of addText/addDate appears to be a per-field boost
        // (title 4.0, abstracts 2.0, most fields 1.0) -- TODO confirm against
        // the addText implementation.
        addText(4.0f, doc, "title", metadata.getTitle());
        addKeyword(doc, "id", study.getId().toString());
        addText(1.0f, doc, "studyId", study.getStudyId());
        addKeyword(doc, "studyId", study.getStudyId());
        addText(1.0f, doc, "dvOwnerId", Long.toString(study.getOwner().getId()));
        String dvNetworkId = study.getOwner().getVdcNetwork().getId().toString();
        /* This is the ID of the DV Network to which the study belongs
         * directly, through its owner DV: */
        addText(1.0f, doc, "ownerDvNetworkId", dvNetworkId);
        /* Plus it may belong to these extra Networks, through linking into
         * collections in DVs that belong to other Networks: */
        logger.fine("Using network id " + dvNetworkId);
        addText(1.0f, doc, "dvNetworkId", dvNetworkId);
        List<Long> linkedToNetworks = study.getLinkedToNetworkIds();
        if (linkedToNetworks != null) {
            for (Long vdcnetworkid : linkedToNetworks) {
                addText(1.0f, doc, "dvNetworkId", vdcnetworkid.toString());
            }
        }
        addDate(1.0f, doc, "productionDate", metadata.getProductionDate());
        addDate(1.0f, doc, "distributionDate", metadata.getDistributionDate());
        Collection<StudyKeyword> keywords = metadata.getStudyKeywords();
        for (Iterator it = keywords.iterator(); it.hasNext();) {
            StudyKeyword elem = (StudyKeyword) it.next();
            addText(1.0f, doc, "keywordValue", elem.getValue());
        }
        Collection<StudyTopicClass> topicClassifications = metadata.getStudyTopicClasses();
        for (Iterator it = topicClassifications.iterator(); it.hasNext();) {
            StudyTopicClass elem = (StudyTopicClass) it.next();
            addText(1.0f, doc, "topicClassValue", elem.getValue());
            addText(1.0f, doc, "topicVocabClassURI", elem.getVocabURI());
            addText(1.0f, doc, "topicClassVocabulary", elem.getVocab());
        }
        Collection<StudyAbstract> abstracts = metadata.getStudyAbstracts();
        for (Iterator it = abstracts.iterator(); it.hasNext();) {
            StudyAbstract elem = (StudyAbstract) it.next();
            addText(2.0f, doc, "abstractText", elem.getText());
            addDate(1.0f, doc, "abstractDate", elem.getDate());
        }
        Collection<StudyAuthor> studyAuthors = metadata.getStudyAuthors();
        for (Iterator it = studyAuthors.iterator(); it.hasNext();) {
            StudyAuthor elem = (StudyAuthor) it.next();
            // NOTE(review): authorName is added twice (boost 3.0 then 1.0) --
            // looks redundant, but preserved as-is.
            addText(3.0f, doc, "authorName", elem.getName());
            addText(1.0f, doc, "authorName", elem.getName());
            addText(1.0f, doc, "authorAffiliation", elem.getAffiliation());
        }
        Collection<StudyProducer> studyProducers = metadata.getStudyProducers();
        for (Iterator itProducers = studyProducers.iterator(); itProducers.hasNext();) {
            StudyProducer studyProducer = (StudyProducer) itProducers.next();
            // All producer attributes are folded into the single "producerName" field.
            addText(1.0f, doc, "producerName", studyProducer.getName());
            addText(1.0f, doc, "producerName", studyProducer.getAbbreviation());
            addText(1.0f, doc, "producerName", studyProducer.getLogo());
            addText(1.0f, doc, "producerName", studyProducer.getUrl());
            addText(1.0f, doc, "producerName", studyProducer.getAffiliation());
            addText(1.0f, doc, "producerName", studyProducer.getMetadata().getProductionPlace());
        }
        Collection<StudyDistributor> studyDistributors = metadata.getStudyDistributors();
        for (Iterator it = studyDistributors.iterator(); it.hasNext();) {
            StudyDistributor studyDistributor = (StudyDistributor) it.next();
            addText(1.0f, doc, "distributorName", studyDistributor.getName());
            addText(1.0f, doc, "distributorName", studyDistributor.getAbbreviation());
            addText(1.0f, doc, "distributorName", studyDistributor.getLogo());
            addText(1.0f, doc, "distributorName", studyDistributor.getUrl());
            addText(1.0f, doc, "distributorName", studyDistributor.getAffiliation());
        }
        Collection<StudyOtherId> otherIds = metadata.getStudyOtherIds();
        for (Iterator it = otherIds.iterator(); it.hasNext();) {
            StudyOtherId elem = (StudyOtherId) it.next();
            addText(1.0f, doc, "otherId", elem.getOtherId());
            addText(1.0f, doc, "otherIdAgency", elem.getAgency());
        }
        // Flat one-to-one metadata fields.
        addText(1.0f, doc, "fundingAgency", metadata.getFundingAgency());
        addText(1.0f, doc, "distributorContact", metadata.getDistributorContact());
        addText(1.0f, doc, "distributorContactAffiliation", metadata.getDistributorContactAffiliation());
        addText(1.0f, doc, "distributorContactEmail", metadata.getDistributorContactEmail());
        addDate(1.0f, doc, "dateOfDeposit", metadata.getDateOfDeposit());
        addText(1.0f, doc, "depositor", metadata.getDepositor());
        addText(1.0f, doc, "seriesName", metadata.getSeriesName());
        addText(1.0f, doc, "seriesInformation", metadata.getSeriesInformation());
        addText(1.0f, doc, "studyVersion", metadata.getStudyVersionText());
        addText(1.0f, doc, "versionDate", metadata.getVersionDate());
        addText(1.0f, doc, "originOfSources", metadata.getOriginOfSources());
        addText(1.0f, doc, "dataSources", metadata.getDataSources());
        addText(1.0f, doc, "frequencyOfDataCollection", metadata.getFrequencyOfDataCollection());
        addText(1.0f, doc, "universe", metadata.getUniverse());
        addText(1.0f, doc, "unitOfAnalysis", metadata.getUnitOfAnalysis());
        addText(1.0f, doc, "dataCollector", metadata.getDataCollector());
        addText(1.0f, doc, "kindOfData", metadata.getKindOfData());
        addText(1.0f, doc, "geographicCoverage", metadata.getGeographicCoverage());
        addText(1.0f, doc, "geographicUnit", metadata.getGeographicUnit());
        addDate(1.0f, doc, "timePeriodCoveredEnd", metadata.getTimePeriodCoveredEnd());
        addDate(1.0f, doc, "timePeriodCoveredStart", metadata.getTimePeriodCoveredStart());
        addDate(1.0f, doc, "dateOfCollection", metadata.getDateOfCollectionStart());
        addDate(1.0f, doc, "dateOfCollectionEnd", metadata.getDateOfCollectionEnd());
        addText(1.0f, doc, "country", metadata.getCountry());
        addText(1.0f, doc, "timeMethod", metadata.getTimeMethod());
        addText(1.0f, doc, "samplingProcedure", metadata.getSamplingProcedure());
        addText(1.0f, doc, "deviationsFromSampleDesign", metadata.getDeviationsFromSampleDesign());
        addText(1.0f, doc, "collectionMode", metadata.getCollectionMode());
        addText(1.0f, doc, "researchInstrument", metadata.getResearchInstrument());
        addText(1.0f, doc, "characteristicOfSources", metadata.getCharacteristicOfSources());
        addText(1.0f, doc, "accessToSources", metadata.getAccessToSources());
        addText(1.0f, doc, "dataCollectionSituation", metadata.getDataCollectionSituation());
        addText(1.0f, doc, "actionsToMinimizeLoss", metadata.getActionsToMinimizeLoss());
        addText(1.0f, doc, "controlOperations", metadata.getControlOperations());
        addText(1.0f, doc, "weighting", metadata.getWeighting());
        addText(1.0f, doc, "cleaningOperations", metadata.getCleaningOperations());
        addText(1.0f, doc, "studyLevelErrorNotes", metadata.getStudyLevelErrorNotes());
        List<StudyNote> studyNotes = metadata.getStudyNotes();
        for (Iterator it = studyNotes.iterator(); it.hasNext();) {
            StudyNote elem = (StudyNote) it.next();
            addText(1.0f, doc, "studyNoteType", elem.getType());
            addText(1.0f, doc, "studyNoteSubject", elem.getSubject());
            addText(1.0f, doc, "studyNoteText", elem.getText());
        }
        addText(1.0f, doc, "responseRate", metadata.getResponseRate());
        addText(1.0f, doc, "samplingErrorEstimate", metadata.getSamplingErrorEstimate());
        addText(1.0f, doc, "otherDataAppraisal", metadata.getOtherDataAppraisal());
        addText(1.0f, doc, "placeOfAccess", metadata.getPlaceOfAccess());
        addText(1.0f, doc, "originalArchive", metadata.getOriginalArchive());
        addText(1.0f, doc, "availabilityStatus", metadata.getAvailabilityStatus());
        addText(1.0f, doc, "collectionSize", metadata.getCollectionSize());
        addText(1.0f, doc, "studyCompletion", metadata.getStudyCompletion());
        addText(1.0f, doc, "confidentialityDeclaration", metadata.getConfidentialityDeclaration());
        addText(1.0f, doc, "specialPermissions", metadata.getSpecialPermissions());
        addText(1.0f, doc, "restrictions", metadata.getRestrictions());
        addText(1.0f, doc, "contact", metadata.getContact());
        addText(1.0f, doc, "citationRequirements", metadata.getCitationRequirements());
        addText(1.0f, doc, "depositorRequirements", metadata.getDepositorRequirements());
        addText(1.0f, doc, "conditions", metadata.getConditions());
        addText(1.0f, doc, "disclaimer", metadata.getDisclaimer());
        List<StudyRelMaterial> relMaterials = metadata.getStudyRelMaterials();
        for (Iterator it = relMaterials.iterator(); it.hasNext();) {
            StudyRelMaterial elem = (StudyRelMaterial) it.next();
            addText(1.0f, doc, "relatedMaterial", elem.getText());
        }
        List<StudyRelStudy> relStudies = metadata.getStudyRelStudies();
        for (Iterator it = relStudies.iterator(); it.hasNext();) {
            StudyRelStudy elem = (StudyRelStudy) it.next();
            addText(1.0f, doc, "relatedStudy", elem.getText());
        }
        List<StudyOtherRef> otherRefs = metadata.getStudyOtherRefs();
        for (Iterator it = otherRefs.iterator(); it.hasNext();) {
            StudyOtherRef elem = (StudyOtherRef) it.next();
            addText(1.0f, doc, "otherReferences", elem.getText());
        }
        for (StudyRelPublication elem : metadata.getStudyRelPublications()) {
            // Publication id is "idType:idNumber" when a type is present.
            String publicationId = (elem.getIdType() != null ? elem.getIdType() + ":" : "")
                    + elem.getIdNumber();
            if (elem.isReplicationData()) {
                addText(1.0f, doc, "replicationFor", elem.getText());
                addText(1.0f, doc, "replicationForId", publicationId);
                addText(1.0f, doc, "replicationForURL", elem.getUrl());
            } else {
                addText(1.0f, doc, "relatedPublications", elem.getText());
                addText(1.0f, doc, "relatedPublicationsId", publicationId);
                addText(1.0f, doc, "relatedPublicationsURL", elem.getUrl());
            }
        }
        addText(1.0f, doc, "subtitle", metadata.getSubTitle());
        List<StudyKeyword> studyKeywords = metadata.getStudyKeywords();
        for (Iterator it = studyKeywords.iterator(); it.hasNext();) {
            StudyKeyword elem = (StudyKeyword) it.next();
            addText(1.0f, doc, "keywordVocabulary", elem.getVocab());
            addText(1.0f, doc, "keywordVocabulary", elem.getVocabURI());
        }
        addText(1.0f, doc, "protocol", study.getProtocol());
        addText(1.0f, doc, "authority", study.getAuthority());
        addText(1.0f, doc, "globalId", study.getGlobalId());
        List<StudySoftware> studySoftware = metadata.getStudySoftware();
        for (Iterator it = studySoftware.iterator(); it.hasNext();) {
            StudySoftware elem = (StudySoftware) it.next();
            addText(1.0f, doc, "studySoftware", elem.getName());
            addText(1.0f, doc, "studySoftwareVersion", elem.getSoftwareVersion());
        }
        List<StudyGrant> studyGrants = metadata.getStudyGrants();
        for (Iterator it = studyGrants.iterator(); it.hasNext();) {
            StudyGrant elem = (StudyGrant) it.next();
            addText(1.0f, doc, "studyGrantNumber", elem.getNumber());
            addText(1.0f, doc, "studyGrantNumberAgency", elem.getAgency());
        }
        List<StudyGeoBounding> studyGeoBounding = metadata.getStudyGeoBoundings();
        for (Iterator it = studyGeoBounding.iterator(); it.hasNext();) {
            StudyGeoBounding elem = (StudyGeoBounding) it.next();
            addText(1.0f, doc, "studyEastLongitude", elem.getEastLongitude());
            addText(1.0f, doc, "studyWestLongitude", elem.getWestLongitude());
            addText(1.0f, doc, "studyNorthLatitude", elem.getNorthLatitude());
            addText(1.0f, doc, "studySouthLatitude", elem.getSouthLatitude());
        }
        // Extented metadata fields:
        // NOTE(review): templateName is assigned but never used below.
        String templateName = metadata.getStudy().getTemplate().getName();
        for (StudyFieldValue extFieldValue : metadata.getStudyFieldValues()) {
            try {
                StudyField extStudyField = extFieldValue.getStudyField();
                String extFieldName = extStudyField.getName();
                String extFieldStrValue = extFieldValue.getStrValue();
                if (extFieldName != null && !extFieldName.equals("") && extFieldStrValue != null
                        && !extFieldStrValue.equals("")) {
                    addText(2.0f, doc, extFieldName, extFieldStrValue);
                    // Whenever we encounter an extended field actually used in a
                    // study's metadata, make it searchable on the Advanced Search
                    // page -- but only in the study's own dataverse (the network-
                    // level page uses the default isAdvancedField list instead).
                    if (!metadata.getStudy().getOwner().getAdvSearchFields().contains(extStudyField)) {
                        metadata.getStudy().getOwner().getAdvSearchFields().add(extStudyField);
                    }
                }
            } catch (Exception ex) {
                // do nothing - if we can't retrieve the field, we are
                // not going to index it, that's all.
            }
        }
        for (FileMetadata fileMetadata : sv.getFileMetadatas()) {
            addText(1.0f, doc, "fileDescription", fileMetadata.getDescription());
        }
        addText(1.0f, doc, "unf", metadata.getUNF());
        logger.fine("Indexing study db id " + study.getId() + " (" + study.getStudyId() + ": "
                + metadata.getTitle() + ") from dataverse id " + study.getOwner().getId() + " ("
                + study.getOwner().getAlias() + ")");
        writer = new IndexWriter(dir, getAnalyzer(), isIndexEmpty(), IndexWriter.MaxFieldLength.UNLIMITED);
        writer.setUseCompoundFile(true);
        // Facet (taxonomy) categories for the main document.
        TaxonomyWriter taxo = new DirectoryTaxonomyWriter(taxoDir);
        List<CategoryPath> categoryPaths = new ArrayList<CategoryPath>();
        addFacet(categoryPaths, "dvName", study.getOwner().getName());
        addFacetDate(categoryPaths, "productionDate", metadata.getProductionDate());
        addFacetDate(categoryPaths, "distributionDate", metadata.getDistributionDate());
        for (Iterator it = studyDistributors.iterator(); it.hasNext();) {
            StudyDistributor studyDistributor = (StudyDistributor) it.next();
            addFacet(categoryPaths, "distributorName", studyDistributor.getName());
        }
        for (Iterator it = studyAuthors.iterator(); it.hasNext();) {
            StudyAuthor elem = (StudyAuthor) it.next();
            addFacet(categoryPaths, "authorName", elem.getName());
            addFacet(categoryPaths, "authorAffiliation", elem.getAffiliation());
        }
        addFacet(categoryPaths, "country", metadata.getCountry());
        for (Iterator it = keywords.iterator(); it.hasNext();) {
            StudyKeyword elem = (StudyKeyword) it.next();
            addFacet(categoryPaths, "keywordValue", elem.getValue());
        }
        for (Iterator it = topicClassifications.iterator(); it.hasNext();) {
            StudyTopicClass elem = (StudyTopicClass) it.next();
            if (elem.getValue() != null && (!elem.getValue().equals("")) && elem.getVocab() != null
                    && (!elem.getVocab().equals(""))) {
                addFacet(categoryPaths, "topicClassValueParensVocab",
                        elem.getValue().trim() + " (" + elem.getVocab().trim() + ")");
            }
        }
        CategoryDocumentBuilder categoryDocBuilder = new CategoryDocumentBuilder(taxo);
        categoryDocBuilder.setCategoryPaths(categoryPaths);
        categoryDocBuilder.build(doc);
        writer.addDocument(doc);
        // warnings from https://svn.apache.org/repos/asf/lucene/dev/tags/lucene_solr_3_5_0/lucene/contrib/facet/src/examples/org/apache/lucene/facet/example/simple/SimpleIndexer.java
        // we commit changes to the taxonomy index prior to committing them to the search index.
        // this is important, so that all facets referred to by documents in the search index
        // will indeed exist in the taxonomy index.
        taxo.commit();
        writer.commit();
        // close the taxonomy index and the index - all modifications are
        // now safely in the provided directories: indexDir and taxoDir.
        taxo.close();
        writer.close();
        // Second pass: one document per data variable of each tabular data file.
        writerVar = new IndexWriter(dir, getAnalyzer(), isIndexEmpty(), IndexWriter.MaxFieldLength.UNLIMITED);
        StudyFile studyFile = null;
        DataTable dataTable = null;
        List<DataVariable> dataVariables = null;
        for (FileMetadata fileMetadata : sv.getFileMetadatas()) {
            //TODO: networkDataFile
            studyFile = fileMetadata.getStudyFile();
            if (studyFile instanceof TabularDataFile) {
                dataTable = ((TabularDataFile) studyFile).getDataTable();
                if (dataTable != null) {
                    dataVariables = dataTable.getDataVariables();
                    for (int j = 0; j < dataVariables.size(); j++) {
                        Document docVariables = new Document();
                        addText(1.0f, docVariables, "varStudyId", study.getId().toString());
                        addText(1.0f, docVariables, "varStudyFileId", studyFile.getId().toString());
                        DataVariable dataVariable = dataVariables.get(j);
                        addText(1.0f, docVariables, "varId", dataVariable.getId().toString());
                        addText(1.0f, docVariables, "varName", dataVariable.getName());
                        addText(1.0f, docVariables, "varLabel", dataVariable.getLabel());
                        writerVar.addDocument(docVariables);
                    }
                    dataVariables = null;
                    dataTable = null;
                }
            }
            studyFile = null;
        }
        writerVar.close();
        // Third pass: one document per SpecialOtherFile's metadata field values.
        writerFileMeta = new IndexWriter(dir, getAnalyzer(), isIndexEmpty(),
                IndexWriter.MaxFieldLength.UNLIMITED);
        for (FileMetadata fileMetadata : sv.getFileMetadatas()) {
            studyFile = fileMetadata.getStudyFile();
            if (studyFile instanceof SpecialOtherFile) {
                Document docFileMetadata = new Document();
                // the "id" is the database id of the *study*; - for
                // compatibility with the study-level index files.
                addKeyword(docFileMetadata, "id", study.getId().toString());
                addText(1.0f, docFileMetadata, "studyFileId", studyFile.getId().toString());
                List<FileMetadataFieldValue> fileMetadataFieldValues = fileMetadata.getStudyFile()
                        .getFileMetadataFieldValues();
                for (int j = 0; j < fileMetadataFieldValues.size(); j++) {
                    String fieldValue = fileMetadataFieldValues.get(j).getStrValue();
                    FileMetadataField fmf = fileMetadataFieldValues.get(j).getFileMetadataField();
                    String fileMetadataFieldName = fmf.getName();
                    String fileMetadataFieldFormatName = fmf.getFileFormatName();
                    // Index field name is "<formatName>-<fieldName>".
                    String indexFileName = fileMetadataFieldFormatName + "-" + fileMetadataFieldName;
                    addText(1.0f, docFileMetadata, indexFileName, fieldValue);
                }
                writerFileMeta.addDocument(docFileMetadata);
            }
            studyFile = null;
        }
        writerFileMeta.close();
        // Fourth pass: UNFs of archived versions (whitespace-analyzed).
        writerVersions = new IndexWriter(dir, new WhitespaceAnalyzer(), isIndexEmpty(),
                IndexWriter.MaxFieldLength.UNLIMITED);
        for (StudyVersion version : study.getStudyVersions()) {
            // The current(released) version UNF is indexed in the main document
            // only index previous(archived) version UNFs here
            if (version.isArchived()) {
                Document docVersions = new Document();
                addKeyword(docVersions, "versionStudyId", study.getId().toString());
                addText(1.0f, docVersions, "versionId", version.getId().toString());
                addText(1.0f, docVersions, "versionNumber", version.getVersionNumber().toString());
                addKeyword(docVersions, "versionUnf", version.getMetadata().getUNF());
                writerVersions.addDocument(docVersions);
            }
        }
        writerVersions.close();
        logger.fine("End indexing study " + study.getStudyId());
    }
}
From source file:org.wso2.carbon.analytics.dataservice.core.indexing.AnalyticsDataIndexer.java
License:Open Source License
/**
 * Updates the local Lucene search index and taxonomy index of the given shard
 * with a batch of records. An existing document with the same internal record
 * id is replaced (delete-by-term + add).
 *
 * @param shardIndex  the local shard whose writers should be used
 * @param recordBatch non-empty batch of records; all records are assumed to
 *                    belong to the same tenant/table (only the first record is
 *                    consulted to resolve the table id)
 * @param columns     column definitions used to build each index document
 * @throws AnalyticsIndexException if building documents or committing either
 *                                 index fails
 */
private void updateIndex(int shardIndex, List<Record> recordBatch, Map<String, ColumnDefinition> columns)
        throws AnalyticsIndexException {
    if (log.isDebugEnabled()) {
        log.debug("Updating data in local index [" + shardIndex + "]: " + recordBatch.size());
    }
    Record firstRecord = recordBatch.get(0);
    int tenantId = firstRecord.getTenantId();
    String tableName = firstRecord.getTableName();
    String tableId = this.generateTableId(tenantId, tableName);
    IndexWriter indexWriter = this.lookupIndexWriter(shardIndex, tableId);
    TaxonomyWriter taxonomyWriter = this.lookupTaxonomyIndexWriter(shardIndex, tableId);
    try {
        for (Record record : recordBatch) {
            // Replace any existing document keyed on the internal record id.
            indexWriter.updateDocument(new Term(INDEX_ID_INTERNAL_FIELD, record.getId()),
                    this.generateIndexDoc(record, columns, taxonomyWriter).getFields());
        }
        // FIX: commit the taxonomy BEFORE the search index. The Lucene facet
        // module requires that every facet ordinal referenced by committed
        // search documents already exists in the committed taxonomy; the
        // previous order (indexWriter first) could leave a committed search
        // index pointing at uncommitted taxonomy ordinals after a crash
        // between the two commits.
        taxonomyWriter.commit();
        indexWriter.commit();
        if (this.isIndexingStatsEnabled()) {
            this.statsCollector.processedRecords(recordBatch.size());
        }
    } catch (IOException e) {
        throw new AnalyticsIndexException("Error in updating index: " + e.getMessage(), e);
    }
}