Usage examples for org.apache.lucene.index.IndexWriter.commit
@Override public final long commit() throws IOException
Commits all pending changes (added and deleted documents, segment merges, added indexes, etc.) to the index, and syncs all referenced index files, such that a reader will see the changes and the index updates will survive an OS or machine crash or power loss.
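Before the project examples, here is a minimal self-contained sketch of the typical call sequence. This is a sketch only, written against the Lucene 5+ API; the index path and field name are illustrative placeholders, not values taken from the examples below.

import java.nio.file.Paths;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;

public class CommitSketch {
  public static void main(String[] args) throws Exception {
    // "/tmp/example-index" is a placeholder path.
    try (Directory dir = FSDirectory.open(Paths.get("/tmp/example-index"));
        IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig(new StandardAnalyzer()))) {
      Document doc = new Document();
      doc.add(new TextField("body", "hello, commit", Store.YES));
      writer.addDocument(doc);
      // Durably persist the pending change; readers opened after this point
      // (and the index state after a crash) will include the document.
      writer.commit();
    }
  }
}

Note that close() also commits pending changes by default in these Lucene versions, so the explicit commit() calls in the examples below mostly serve to establish a durable point before further work such as forceMerge(1) or serializing the index.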
From source file:com.khepry.frackhem.entities.Blendeds.java
License:Apache License
public void indexViaLucene(String textFilePath, String textColSeparator, String casEdfIdFieldName,
    Map<String, Toxicity> toxicities) throws IOException {
  String message = "Start Indexing Blendeds via Lucene...";
  if (outputToSystemOut) {
    System.out.println(message);
  }
  if (outputToMsgQueue) {
    progressMessageQueue.send(new MessageInput(message));
  }
  File textFile = new File(textFilePath);
  if (textFile.exists()) {
    File indexFolder = new File(indexFolderPath);
    if (!indexFolder.exists()) {
      indexFolder.mkdir();
    } else {
      deleteFolder(indexFolder);
      if (!indexFolder.exists()) {
        indexFolder.mkdir();
      }
    }
    File taxonomyFolder = new File(taxonomyFolderPath);
    if (!taxonomyFolder.exists()) {
      taxonomyFolder.mkdir();
    } else {
      deleteFolder(taxonomyFolder);
      if (!taxonomyFolder.exists()) {
        taxonomyFolder.mkdir();
      }
    }
    if (indexFolder.exists() && taxonomyFolder.exists()) {
      List<String> colHeaders = new ArrayList<>();
      Map<String, Integer> colIndexes = new LinkedHashMap<>();
      Map<String, String> mapIndexFields = new LinkedHashMap<>();
      Map<String, String> mapStatsFields = new LinkedHashMap<>();
      String[] pieces;
      String[] tuples;
      pieces = indexFields.split(",");
      for (String indexField : pieces) {
        mapIndexFields.put(indexField, indexField);
      }
      pieces = statsFields.split(",");
      for (String statField : pieces) {
        tuples = statField.split(":");
        mapStatsFields.put(tuples[0], tuples.length > 1 ? tuples[1] : tuples[0]);
      }
      SimpleFSDirectory indexDirectory = new SimpleFSDirectory(indexFolder);
      Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_44);
      IndexWriterConfig indexWriterConfig = new IndexWriterConfig(Version.LUCENE_44, analyzer);
      IndexWriter indexWriter = new IndexWriter(indexDirectory, indexWriterConfig);
      SimpleFSDirectory taxonomyDirectory = new SimpleFSDirectory(taxonomyFolder);
      TaxonomyWriter taxonomyWriter = new DirectoryTaxonomyWriter(taxonomyDirectory, OpenMode.CREATE);
      FacetFields facetFields = new FacetFields(taxonomyWriter);
      List<CategoryPath> taxonomyCategories = new ArrayList<>();
      String line;
      Integer rcdCount = 0;
      StringBuilder sb = new StringBuilder();
      BufferedReader br = new BufferedReader(new FileReader(textFile));
      while ((line = br.readLine()) != null) {
        rcdCount++;
        pieces = line.split(textColSeparator);
        if (rcdCount == 1) {
          int i = 0;
          for (String colHeader : pieces) {
            colHeaders.add(colHeader.trim());
            colIndexes.put(colHeader, i);
            i++;
          }
        } else {
          if (pieces.length == colHeaders.size()) {
            sb.setLength(0);
            Document document = new Document();
            for (int i = 0; i < pieces.length; i++) {
              Field field = new TextField(colHeaders.get(i), pieces[i].trim(), Store.YES);
              document.add(field);
              if (mapIndexFields.containsKey(colHeaders.get(i))) {
                if (!pieces[i].trim().equals("")) {
                  sb.append(pieces[i].trim());
                  sb.append(" ");
                }
              }
            }
            // append toxicity information to the document
            String toxCasEdfId = document.get(casEdfIdFieldName).trim();
            Toxicity toxicity = new Toxicity();
            if (toxicities.containsKey(toxCasEdfId)) {
              toxicity = toxicities.get(toxCasEdfId);
              document.add(new TextField("toxChemicalName", toxicity.getToxChemicalName().trim(), Store.YES));
              sb.append(toxicity.getToxChemicalName().trim());
              sb.append(" ");
              document.add(new TextField("toxRecognized", toxicity.getToxRecognized().trim(), Store.YES));
              sb.append(toxicity.getToxRecognized().trim());
              sb.append(" ");
              document.add(new TextField("toxSuspected", toxicity.getToxSuspected().trim(), Store.YES));
              sb.append(toxicity.getToxSuspected().trim());
              sb.append(" ");
            } else {
              document.add(new TextField("toxChemicalName", "", Store.YES));
              document.add(new TextField("toxRecognized", "", Store.YES));
              document.add(new TextField("toxSuspected", "", Store.YES));
            }
            Field field = new TextField("text", sb.toString().trim(), Store.NO);
            document.add(field);
            String toxChemical = toxicity.getToxChemicalName().trim();
            // categorize recognized toxicities
            String toxRecognized = toxicity.getToxRecognized().trim();
            if (!toxRecognized.equals("")) {
              taxonomyCategories.add(new CategoryPath("toxRecognized", "CasEdfId", toxCasEdfId));
              taxonomyCategories.add(new CategoryPath("toxRecognized", "Chemical", toxChemical.replace("/", "|")));
              for (String value : toxRecognized.replace(" ", ",").split(",")) {
                if (!value.trim().equals("")) {
                  taxonomyCategories.add(new CategoryPath("toxRecognized", "Toxicity", value));
                }
              }
            }
            // categorize suspected toxicities
            String toxSuspected = toxicity.getToxSuspected().trim();
            if (!toxSuspected.equals("")) {
              taxonomyCategories.add(new CategoryPath("toxSuspected", "CasEdfId", toxCasEdfId));
              taxonomyCategories.add(new CategoryPath("toxSuspected", "Chemical", toxChemical.replace("/", "|")));
              for (String value : toxSuspected.replace(" ", ",").split(",")) {
                if (!value.trim().equals("")) {
                  taxonomyCategories.add(new CategoryPath("toxSuspected", "Toxicity", value));
                }
              }
            }
            // build up "stats" taxonomy categories
            for (String statsKey : mapStatsFields.keySet()) {
              if (mapIndexFields.containsKey(statsKey)) {
                String fieldValue = mapIndexFields.get(statsKey);
                if (!statsKey.trim().equals("") && !fieldValue.trim().equals("")) {
                  taxonomyCategories.add(new CategoryPath("Blendeds", statsKey, fieldValue));
                }
              }
            }
            if (taxonomyCategories.size() > 0) {
              facetFields.addFields(document, taxonomyCategories);
              // System.out.println("Taxonomies added: " + taxonomyCategories.size());
            }
            indexWriter.addDocument(document);
            if (progressInterval > 0 && rcdCount % progressInterval == 0) {
              message = "Records indexed: " + rcdCount;
              if (outputToSystemOut) {
                System.out.println(message);
              }
              if (outputToMsgQueue) {
                progressMessageQueue.send(new MessageInput(message));
              }
            }
            taxonomyCategories.clear();
          }
        }
      }
      br.close();
      message = "Records indexed: " + rcdCount;
      if (outputToSystemOut) {
        System.out.println(message);
      }
      if (outputToMsgQueue) {
        progressMessageQueue.send(new MessageInput(message));
      }
      sb.setLength(0);
      sb.trimToSize();
      indexWriter.commit();
      indexWriter.forceMerge(1);
      indexWriter.close();
      taxonomyWriter.commit();
      taxonomyWriter.close();
      analyzer.close();
      indexDirectory.close();
      taxonomyDirectory.close();
    } else {
      message = "Lucene Index Folder: " + indexFolder + " or Lucene Taxonomy folder: " + taxonomyFolder
          + " does not exist!";
      if (outputToSystemErr) {
        System.err.println(message);
      }
      if (outputToMsgQueue) {
        progressMessageQueue.send(new MessageInput(message));
      }
    }
    message = "Ended Indexing Blendeds via Lucene!";
    if (outputToSystemOut) {
      System.out.println(message);
    }
    if (outputToMsgQueue) {
      progressMessageQueue.send(new MessageInput(message));
    }
  }
}
From source file:com.khepry.frackhem.entities.Chemicals.java
License:Apache License
public void indexViaLucene(String textFilePath, String textColSeparator, String casEdfIdFieldName,
    Map<String, Toxicity> toxicities) throws IOException {
  String message = "Start Indexing Chemicals via Lucene...";
  if (outputToSystemOut) {
    System.out.println(message);
  }
  if (outputToMsgQueue) {
    progressMessageQueue.send(new MessageInput(message));
  }
  File textFile = new File(textFilePath);
  if (textFile.exists()) {
    File indexFolder = new File(indexFolderPath);
    if (!indexFolder.exists()) {
      indexFolder.mkdir();
    } else {
      deleteFolder(indexFolder);
      if (!indexFolder.exists()) {
        indexFolder.mkdir();
      }
    }
    File taxonomyFolder = new File(taxonomyFolderPath);
    if (!taxonomyFolder.exists()) {
      taxonomyFolder.mkdir();
    } else {
      deleteFolder(taxonomyFolder);
      if (!taxonomyFolder.exists()) {
        taxonomyFolder.mkdir();
      }
    }
    if (indexFolder.exists() && taxonomyFolder.exists()) {
      List<String> colHeaders = new ArrayList<>();
      Map<String, Integer> colIndexes = new LinkedHashMap<>();
      Map<String, String> mapIndexFields = new LinkedHashMap<>();
      Map<String, String> mapStatsFields = new LinkedHashMap<>();
      String[] pieces;
      String[] tuples;
      pieces = indexFields.split(",");
      for (String indexField : pieces) {
        mapIndexFields.put(indexField, indexField);
      }
      pieces = statsFields.split(",");
      for (String statField : pieces) {
        tuples = statField.split(":");
        mapStatsFields.put(tuples[0], tuples.length > 1 ? tuples[1] : tuples[0]);
      }
      SimpleFSDirectory indexDirectory = new SimpleFSDirectory(indexFolder);
      Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_44);
      IndexWriterConfig indexWriterConfig = new IndexWriterConfig(Version.LUCENE_44, analyzer);
      IndexWriter indexWriter = new IndexWriter(indexDirectory, indexWriterConfig);
      SimpleFSDirectory taxonomyDirectory = new SimpleFSDirectory(taxonomyFolder);
      TaxonomyWriter taxonomyWriter = new DirectoryTaxonomyWriter(taxonomyDirectory, OpenMode.CREATE);
      FacetFields facetFields = new FacetFields(taxonomyWriter);
      List<CategoryPath> taxonomyCategories = new ArrayList<>();
      String line;
      Integer rcdCount = 0;
      StringBuilder sb = new StringBuilder();
      BufferedReader br = new BufferedReader(new FileReader(textFile));
      while ((line = br.readLine()) != null) {
        rcdCount++;
        pieces = line.split(textColSeparator);
        if (rcdCount == 1) {
          int i = 0;
          for (String colHeader : pieces) {
            colHeaders.add(colHeader.trim());
            colIndexes.put(colHeader, i);
            i++;
          }
        } else {
          if (pieces.length == colHeaders.size()) {
            sb.setLength(0);
            Document document = new Document();
            for (int i = 0; i < pieces.length; i++) {
              Field field = new TextField(colHeaders.get(i), pieces[i].trim(), Store.YES);
              document.add(field);
              if (mapIndexFields.containsKey(colHeaders.get(i))) {
                if (!pieces[i].trim().equals("")) {
                  sb.append(pieces[i].trim());
                  sb.append(" ");
                }
              }
            }
            // append toxicity information to the document
            String toxCasEdfId = document.get(casEdfIdFieldName).trim();
            Toxicity toxicity = new Toxicity();
            if (toxicities.containsKey(toxCasEdfId)) {
              toxicity = toxicities.get(toxCasEdfId);
              document.add(new TextField("toxChemicalName", toxicity.getToxChemicalName().trim(), Store.YES));
              sb.append(toxicity.getToxChemicalName().trim());
              sb.append(" ");
              document.add(new TextField("toxRecognized", toxicity.getToxRecognized().trim(), Store.YES));
              sb.append(toxicity.getToxRecognized().trim());
              sb.append(" ");
              document.add(new TextField("toxSuspected", toxicity.getToxSuspected().trim(), Store.YES));
              sb.append(toxicity.getToxSuspected().trim());
              sb.append(" ");
            } else {
              document.add(new TextField("toxChemicalName", "", Store.YES));
              document.add(new TextField("toxRecognized", "", Store.YES));
              document.add(new TextField("toxSuspected", "", Store.YES));
            }
            Field field = new TextField("text", sb.toString().trim(), Store.NO);
            document.add(field);
            String toxChemical = toxicity.getToxChemicalName().trim();
            // categorize recognized toxicities
            String toxRecognized = toxicity.getToxRecognized().trim();
            if (!toxRecognized.equals("")) {
              taxonomyCategories.add(new CategoryPath("toxRecognized", "CasEdfId", toxCasEdfId));
              taxonomyCategories.add(new CategoryPath("toxRecognized", "Chemical", toxChemical.replace("/", "|")));
              for (String value : toxRecognized.replace(" ", ",").split(",")) {
                if (!value.trim().equals("")) {
                  taxonomyCategories.add(new CategoryPath("toxRecognized", "Toxicity", value));
                }
              }
            }
            // categorize suspected toxicities
            String toxSuspected = toxicity.getToxSuspected().trim();
            if (!toxSuspected.equals("")) {
              taxonomyCategories.add(new CategoryPath("toxSuspected", "CasEdfId", toxCasEdfId));
              taxonomyCategories.add(new CategoryPath("toxSuspected", "Chemical", toxChemical.replace("/", "|")));
              for (String value : toxSuspected.replace(" ", ",").split(",")) {
                if (!value.trim().equals("")) {
                  taxonomyCategories.add(new CategoryPath("toxSuspected", "Toxicity", value));
                }
              }
            }
            // build up "stats" taxonomy categories
            for (String statsKey : mapStatsFields.keySet()) {
              if (mapIndexFields.containsKey(statsKey)) {
                String fieldValue = mapIndexFields.get(statsKey);
                if (!statsKey.trim().equals("") && !fieldValue.trim().equals("")) {
                  taxonomyCategories.add(new CategoryPath("Chemicals", statsKey, fieldValue));
                }
              }
            }
            if (taxonomyCategories.size() > 0) {
              facetFields.addFields(document, taxonomyCategories);
              // System.out.println("Taxonomies added: " + taxonomyCategories.size());
            }
            indexWriter.addDocument(document);
            if (progressInterval > 0 && rcdCount % progressInterval == 0) {
              message = "Records indexed: " + rcdCount;
              if (outputToSystemOut) {
                System.out.println(message);
              }
              if (outputToMsgQueue) {
                progressMessageQueue.send(new MessageInput(message));
              }
            }
            taxonomyCategories.clear();
          }
        }
      }
      br.close();
      message = "Records indexed: " + rcdCount;
      if (outputToSystemOut) {
        System.out.println(message);
      }
      if (outputToMsgQueue) {
        progressMessageQueue.send(new MessageInput(message));
      }
      sb.setLength(0);
      sb.trimToSize();
      indexWriter.commit();
      indexWriter.forceMerge(1);
      indexWriter.close();
      taxonomyWriter.commit();
      taxonomyWriter.close();
      analyzer.close();
      indexDirectory.close();
      taxonomyDirectory.close();
    } else {
      message = "Lucene Index Folder: " + indexFolder + " or Lucene Taxonomy folder: " + taxonomyFolder
          + " does not exist!";
      if (outputToSystemErr) {
        System.err.println(message);
      }
      if (outputToMsgQueue) {
        progressMessageQueue.send(new MessageInput(message));
      }
    }
    message = "Ended Indexing Chemicals via Lucene!";
    if (outputToSystemOut) {
      System.out.println(message);
    }
    if (outputToMsgQueue) {
      progressMessageQueue.send(new MessageInput(message));
    }
  }
}
From source file:com.khepry.frackhem.entities.Reports.java
License:Apache License
public void indexViaLucene(String textPath, String textColSeparator, Map<String, Toxicity> toxicities,
    String... parseFields) throws IOException {
  String message = "Start Indexing Reports via Lucene...";
  if (outputToSystemOut) {
    System.out.println(message);
  }
  if (outputToMsgQueue) {
    progressMessageQueue.send(new MessageInput(message));
  }
  File textFile = new File(textPath);
  if (textFile.exists()) {
    File indexFolder = new File(indexFolderPath);
    if (!indexFolder.exists()) {
      indexFolder.mkdir();
    }
    File taxonomyFolder = new File(taxonomyFolderPath);
    if (!taxonomyFolder.exists()) {
      taxonomyFolder.mkdir();
    }
    if (indexFolder.exists() && taxonomyFolder.exists()) {
      deleteFolder(indexFolder);
      if (!indexFolder.exists()) {
        indexFolder.mkdir();
      }
      deleteFolder(taxonomyFolder);
      if (!taxonomyFolder.exists()) {
        taxonomyFolder.mkdir();
      }
      Map<String, String> mapBreakFields = new LinkedHashMap<>();
      Map<String, String> mapIndexFields = new LinkedHashMap<>();
      Map<String, String> mapLevelFields = new LinkedHashMap<>();
      Map<String, String> mapStatsFields = new LinkedHashMap<>();
      Map<String, Integer> mapColIndexes = new LinkedHashMap<>();
      String[] pieces;
      String[] tuples;
      pieces = indexFields.split(",");
      for (String indexField : pieces) {
        mapIndexFields.put(indexField, "");
      }
      pieces = levelFields.split(",");
      for (String levelField : pieces) {
        mapBreakFields.put(levelField, "");
        mapLevelFields.put(levelField, "");
      }
      pieces = statsFields.split(",");
      for (String statField : pieces) {
        tuples = statField.split(":");
        mapStatsFields.put(tuples[0], tuples.length > 1 ? tuples[1] : tuples[0]);
      }
      Map<String, Map<String, String>> mapToxValues = new LinkedHashMap<>();
      for (String parseField : parseFields) {
        mapToxValues.put(parseField, new TreeMap<String, String>());
      }
      SimpleFSDirectory indexDirectory = new SimpleFSDirectory(indexFolder);
      SimpleFSDirectory taxonomyDirectory = new SimpleFSDirectory(taxonomyFolder);
      Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_44);
      IndexWriterConfig indexWriterConfig = new IndexWriterConfig(Version.LUCENE_44, analyzer);
      IndexWriter indexWriter = new IndexWriter(indexDirectory, indexWriterConfig);
      TaxonomyWriter taxonomyWriter = new DirectoryTaxonomyWriter(taxonomyDirectory, OpenMode.CREATE);
      FacetFields facetFields = new FacetFields(taxonomyWriter);
      List<CategoryPath> taxonomyCategories = new ArrayList<>();
      String line;
      StringBuilder sbIndex = new StringBuilder();
      StringBuilder sbLevel = new StringBuilder();
      Integer outCount = 0;
      Integer rcdCount = 0;
      Boolean firstDataRecordHandled = false;
      BufferedReader br = new BufferedReader(new FileReader(textFile));
      while ((line = br.readLine()) != null) {
        rcdCount++;
        pieces = line.split(textColSeparator);
        if (rcdCount == 1) {
          int i = 0;
          for (String colHeader : pieces) {
            mapColIndexes.put(colHeader.trim(), i);
            i++;
          }
        } else {
          for (String key : mapLevelFields.keySet()) {
            if (mapColIndexes.containsKey(key)) {
              String value = pieces[mapColIndexes.get(key)].trim();
              // build up level-break values
              if (mapLevelFields.containsKey(key)) {
                mapLevelFields.put(key, value);
              }
            }
          }
          if (!firstDataRecordHandled) {
            mapBreakFields.putAll(mapLevelFields);
            firstDataRecordHandled = true;
          }
          // if there is a "level break"
          if (!mapLevelFields.equals(mapBreakFields)) {
            Document tgtDocument = new Document();
            for (Map.Entry<String, String> entry : mapBreakFields.entrySet()) {
              Field field = new TextField(entry.getKey(), entry.getValue(), Store.YES);
              tgtDocument.add(field);
            }
            for (Map.Entry<String, Map<String, String>> toxEntry : mapToxValues.entrySet()) {
              String fieldName = toxEntry.getKey();
              String fieldValue = GenericUtilities.joinString(toxEntry.getValue().values(), " ");
              // System.out.println(fieldName + ": " + fieldValue);
              sbIndex.append(fieldValue);
              sbIndex.append(" ");
              tgtDocument.add(new TextField(fieldName, fieldValue, Store.YES));
              // build up "Toxicity" taxonomy categories
              for (String value : fieldValue.replace(" ", ",").split(",")) {
                if (!value.trim().equals("")) {
                  taxonomyCategories.add(new CategoryPath(fieldName, "Toxicity", value));
                }
              }
              // build up "stats" taxonomy categories
              for (String statsKey : mapStatsFields.keySet()) {
                if (mapLevelFields.containsKey(statsKey)) {
                  String levelValue = mapLevelFields.get(statsKey);
                  if (!statsKey.trim().equals("") && !levelValue.trim().equals("")) {
                    taxonomyCategories.add(new CategoryPath("Reports", statsKey, levelValue));
                  }
                }
              }
            }
            tgtDocument.add(new TextField("text", sbIndex.toString().trim(), Store.NO));
            if (taxonomyCategories.size() > 0) {
              facetFields.addFields(tgtDocument, taxonomyCategories);
              // System.out.println("Taxonomies added: " + taxonomyCategories.size());
            }
            indexWriter.addDocument(tgtDocument);
            outCount++;
            sbIndex.setLength(0);
            for (String key : mapToxValues.keySet()) {
              mapToxValues.get(key).clear();
            }
            taxonomyCategories.clear();
            mapBreakFields.putAll(mapLevelFields);
          }
          // build up text index values
          for (String key : mapLevelFields.keySet()) {
            if (mapColIndexes.containsKey(key)) {
              String value = pieces[mapColIndexes.get(key)].trim();
              if (!value.equals("")) {
                // build up 'text' field index value
                if (mapIndexFields.containsKey(key)) {
                  sbIndex.append(value);
                  sbIndex.append(" ");
                }
              }
            }
          }
          // build up toxicity values for later level-break use
          if (mapColIndexes.containsKey(casEdfIdFieldName)) {
            Toxicity toxicity = toxicities.get(pieces[mapColIndexes.get(casEdfIdFieldName)].trim());
            if (toxicity != null) {
              // build up recognized toxicity values
              String[] toxRValues = toxicity.getToxRecognized().split(",");
              for (String toxValue : toxRValues) {
                if (!toxValue.equals("")) {
                  if (!mapToxValues.get("toxRecognized").containsKey(toxValue)) {
                    mapToxValues.get("toxRecognized").put(toxValue, toxValue);
                  }
                }
              }
              // build up suspected toxicity values
              String[] toxSValues = toxicity.getToxSuspected().split(",");
              for (String toxValue : toxSValues) {
                if (!toxValue.equals("")) {
                  if (!mapToxValues.get("toxSuspected").containsKey(toxValue)) {
                    mapToxValues.get("toxSuspected").put(toxValue, toxValue);
                  }
                }
              }
            }
          }
          if (progressInterval > 0 && rcdCount % progressInterval == 0) {
            message = "Records indexed: " + rcdCount;
            if (outputToSystemOut) {
              System.out.println(message);
            }
            if (outputToMsgQueue) {
              progressMessageQueue.send(new MessageInput(message));
            }
          }
        }
      }
      br.close();
      // handle end-of-file processing
      Document tgtDocument = new Document();
      for (Map.Entry<String, String> entry : mapBreakFields.entrySet()) {
        Field field = new TextField(entry.getKey(), entry.getValue(), Store.YES);
        tgtDocument.add(field);
      }
      for (Map.Entry<String, Map<String, String>> toxEntry : mapToxValues.entrySet()) {
        String fieldName = toxEntry.getKey();
        String fieldValue = GenericUtilities.joinString(toxEntry.getValue().values(), " ");
        // System.out.println(fieldName + ": " + fieldValue);
        sbIndex.append(fieldValue);
        sbIndex.append(" ");
        tgtDocument.add(new TextField(fieldName, fieldValue, Store.YES));
        // build up "Toxicity" taxonomy categories
        for (String value : fieldValue.replace(" ", ",").split(",")) {
          if (!value.trim().equals("")) {
            taxonomyCategories.add(new CategoryPath(fieldName, "Toxicity", value));
          }
        }
        // build up "stats" taxonomy categories
        for (String statsKey : mapStatsFields.keySet()) {
          if (mapLevelFields.containsKey(statsKey)) {
            String levelValue = mapLevelFields.get(statsKey);
            if (!statsKey.trim().equals("") && !levelValue.trim().equals("")) {
              taxonomyCategories.add(new CategoryPath("Reports", statsKey, levelValue));
            }
          }
        }
      }
      tgtDocument.add(new TextField("text", sbIndex.toString().trim(), Store.NO));
      if (taxonomyCategories.size() > 0) {
        facetFields.addFields(tgtDocument, taxonomyCategories);
        // System.out.println("Taxonomies added: " + taxonomyCategories.size());
      }
      indexWriter.addDocument(tgtDocument);
      outCount++;
      message = "Records processed: " + rcdCount;
      if (outputToSystemOut) {
        System.out.println(message);
      }
      if (outputToMsgQueue) {
        progressMessageQueue.send(new MessageInput(message));
      }
      message = "Records indexed: " + outCount;
      if (outputToSystemOut) {
        System.out.println(message);
      }
      if (outputToMsgQueue) {
        progressMessageQueue.send(new MessageInput(message));
      }
      sbIndex.setLength(0);
      sbIndex.trimToSize();
      sbLevel.setLength(0);
      sbLevel.trimToSize();
      mapToxValues.clear();
      indexWriter.commit();
      indexWriter.forceMerge(1);
      indexWriter.close();
      analyzer.close();
      indexDirectory.close();
      taxonomyWriter.commit();
      taxonomyWriter.close();
      taxonomyDirectory.close();
    } else {
      message = "Lucene Index Folder: " + indexFolder + " or Lucene Taxonomy folder: " + taxonomyFolder
          + " does not exist!";
      if (outputToSystemErr) {
        System.err.println(message);
      }
      if (outputToMsgQueue) {
        progressMessageQueue.send(new MessageInput(message));
      }
    }
    message = "Ended Indexing Reports via Lucene!";
    if (outputToSystemOut) {
      System.out.println(message);
    }
    if (outputToMsgQueue) {
      progressMessageQueue.send(new MessageInput(message));
    }
  }
}
From source file:com.khepry.frackhem.entities.Toxicities.java
License:Apache License
public void indexViaLucene(String textFilePath, String textColSeparator) throws IOException {
  String message = "Start Indexing Toxicities via Lucene...";
  if (outputToSystemOut) {
    System.out.println(message);
  }
  if (outputToMsgQueue) {
    progressMessageQueue.send(new MessageInput(message));
  }
  File textFile = new File(textFilePath);
  if (textFile.exists()) {
    File indexFolder = new File(indexFolderPath);
    if (!indexFolder.exists()) {
      indexFolder.mkdir();
    } else {
      deleteFolder(indexFolder);
      if (!indexFolder.exists()) {
        indexFolder.mkdir();
      }
    }
    File taxonomyFolder = new File(taxonomyFolderPath);
    if (!taxonomyFolder.exists()) {
      taxonomyFolder.mkdir();
    } else {
      deleteFolder(taxonomyFolder);
      if (!taxonomyFolder.exists()) {
        taxonomyFolder.mkdir();
      }
    }
    if (indexFolder.exists() && taxonomyFolder.exists()) {
      List<String> colHeaders = new ArrayList<>();
      Map<String, String> mapIndexFields = new LinkedHashMap<>();
      Map<String, String> mapStatsFields = new LinkedHashMap<>();
      String[] pieces;
      String[] tuples;
      pieces = indexFields.split(",");
      for (String indexField : pieces) {
        mapIndexFields.put(indexField, indexField);
      }
      pieces = statsFields.split(",");
      for (String statField : pieces) {
        tuples = statField.split(":");
        mapStatsFields.put(tuples[0], tuples.length > 1 ? tuples[1] : tuples[0]);
      }
      SimpleFSDirectory indexDirectory = new SimpleFSDirectory(indexFolder);
      Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_44);
      IndexWriterConfig indexWriterConfig = new IndexWriterConfig(Version.LUCENE_44, analyzer);
      IndexWriter indexWriter = new IndexWriter(indexDirectory, indexWriterConfig);
      SimpleFSDirectory taxonomyDirectory = new SimpleFSDirectory(taxonomyFolder);
      TaxonomyWriter taxonomyWriter = new DirectoryTaxonomyWriter(taxonomyDirectory, OpenMode.CREATE);
      FacetFields facetFields = new FacetFields(taxonomyWriter);
      List<CategoryPath> taxonomyCategories = new ArrayList<>();
      String line;
      Integer rcdCount = 0;
      StringBuilder sb = new StringBuilder();
      BufferedReader br = new BufferedReader(new FileReader(textFile));
      while ((line = br.readLine()) != null) {
        rcdCount++;
        pieces = line.split(textColSeparator);
        if (rcdCount == 1) {
          for (String colHeader : pieces) {
            colHeaders.add(colHeader.trim());
          }
        } else {
          if (pieces.length == colHeaders.size()) {
            sb.setLength(0);
            Document document = new Document();
            for (int i = 0; i < pieces.length; i++) {
              Field field = new TextField(colHeaders.get(i), pieces[i].trim(), Store.YES);
              document.add(field);
              if (mapIndexFields.containsKey(colHeaders.get(i))) {
                if (!pieces[i].trim().equals("")) {
                  sb.append(pieces[i].trim());
                  sb.append(" ");
                }
              }
            }
            Field field = new TextField("text", sb.toString().trim(), Store.NO);
            document.add(field);
            String toxCasEdfId = pieces[0].trim();
            String toxChemical = pieces[1].trim();
            // categorize recognized toxicities
            String toxRecognized = pieces[2].trim();
            if (!toxRecognized.equals("")) {
              taxonomyCategories.add(new CategoryPath("toxRecognized", "CasEdfId", toxCasEdfId));
              taxonomyCategories.add(new CategoryPath("toxRecognized", "Chemical", toxChemical.replace("/", "|")));
              for (String value : toxRecognized.replace(" ", ",").split(",")) {
                if (!value.trim().equals("")) {
                  taxonomyCategories.add(new CategoryPath("toxRecognized", "Toxicity", value));
                }
              }
            }
            // categorize suspected toxicities
            String toxSuspected = pieces[3].trim();
            if (!toxSuspected.equals("")) {
              taxonomyCategories.add(new CategoryPath("toxSuspected", "CasEdfId", toxCasEdfId));
              taxonomyCategories.add(new CategoryPath("toxSuspected", "Chemical", toxChemical.replace("/", "|")));
              for (String value : toxSuspected.replace(" ", ",").split(",")) {
                if (!value.trim().equals("")) {
                  taxonomyCategories.add(new CategoryPath("toxSuspected", "Toxicity", value));
                }
              }
            }
            // build up "stats" taxonomy categories
            for (String statsKey : mapStatsFields.keySet()) {
              if (mapIndexFields.containsKey(statsKey)) {
                String fieldValue = mapIndexFields.get(statsKey);
                if (!statsKey.trim().equals("") && !fieldValue.trim().equals("")) {
                  taxonomyCategories.add(new CategoryPath("Toxicities", statsKey, fieldValue));
                }
              }
            }
            if (taxonomyCategories.size() > 0) {
              facetFields.addFields(document, taxonomyCategories);
              // System.out.println("Taxonomies added: " + taxonomyCategories.size());
            }
            indexWriter.addDocument(document);
            if (progressInterval > 0 && rcdCount % progressInterval == 0) {
              message = "Records indexed: " + rcdCount;
              if (outputToSystemOut) {
                System.out.println(message);
              }
              if (outputToMsgQueue) {
                progressMessageQueue.send(new MessageInput(message));
              }
            }
            taxonomyCategories.clear();
          }
        }
      }
      br.close();
      message = "Records indexed: " + rcdCount;
      if (outputToSystemOut) {
        System.out.println(message);
      }
      if (outputToMsgQueue) {
        progressMessageQueue.send(new MessageInput(message));
      }
      sb.setLength(0);
      sb.trimToSize();
      indexWriter.commit();
      indexWriter.forceMerge(1);
      indexWriter.close();
      taxonomyWriter.commit();
      taxonomyWriter.close();
      analyzer.close();
      indexDirectory.close();
      taxonomyDirectory.close();
    } else {
      message = "Lucene Index Folder: " + indexFolder + " or Lucene Taxonomy folder: " + taxonomyFolder
          + " does not exist!";
      if (outputToSystemErr) {
        System.err.println(message);
      }
    }
    message = "Ended Indexing Toxicities via Lucene!";
    if (outputToSystemOut) {
      System.out.println(message);
    }
    if (outputToMsgQueue) {
      progressMessageQueue.send(new MessageInput(message));
    }
  }
}
From source file:com.liferay.portal.search.lucene.dump.DumpIndexDeletionPolicy.java
License:Open Source License
public void dump(OutputStream outputStream, IndexWriter indexWriter, Lock commitLock) throws IOException {
  IndexCommit indexCommit = null;
  String segmentsFileName = null;
  commitLock.lock();
  try {
    indexWriter.commit();
    indexCommit = _lastIndexCommit;
    segmentsFileName = indexCommit.getSegmentsFileName();
    _segmentsFileNames.add(segmentsFileName);
  } finally {
    commitLock.unlock();
  }
  try {
    IndexCommitSerializationUtil.serializeIndex(indexCommit, outputStream);
  } finally {
    _segmentsFileNames.remove(segmentsFileName);
  }
}
From source file:com.lucid.solr.sidecar.SidecarIndexReaderFactory.java
License:Apache License
DirectoryReader buildParallelReader(DirectoryReader main, SolrIndexSearcher source, boolean rebuild) {
  try {
    if (source == null) {
      throw new Exception("Source collection is missing.");
    }
    // create as a sibling path of the main index
    Directory d = main.directory();
    File primaryDir = null;
    if (d instanceof FSDirectory) {
      String path = ((FSDirectory) d).getDirectory().getPath();
      primaryDir = new File(path);
      sidecarIndex = new File(primaryDir.getParentFile(), sidecarIndexLocation);
    } else {
      String secondaryPath = System.getProperty("java.io.tmpdir") + File.separator + sidecarIndexLocation
          + "-" + System.currentTimeMillis();
      sidecarIndex = new File(secondaryPath);
    }
    // create a new tmp dir for the secondary indexes
    File secondaryIndex = new File(sidecarIndex, System.currentTimeMillis() + "-index");
    if (rebuild) {
      safeDelete(sidecarIndex);
    }
    parallelFields.addAll(source.getFieldNames());
    parallelFields.remove("id");
    LOG.debug("building a new index");
    Directory dir = FSDirectory.open(secondaryIndex);
    if (IndexWriter.isLocked(dir)) {
      // try forcing unlock
      try {
        IndexWriter.unlock(dir);
      } catch (Exception e) {
        LOG.warn("Failed to unlock " + secondaryIndex);
      }
    }
    int[] mergeTargets;
    AtomicReader[] subReaders = SidecarIndexReader.getSequentialSubReaders(main);
    if (subReaders == null || subReaders.length == 0) {
      mergeTargets = new int[] { main.maxDoc() };
    } else {
      mergeTargets = new int[subReaders.length];
      for (int i = 0; i < subReaders.length; i++) {
        mergeTargets[i] = subReaders[i].maxDoc();
      }
    }
    Version ver = currentCore.getLatestSchema().getDefaultLuceneMatchVersion();
    IndexWriterConfig cfg = new IndexWriterConfig(ver, currentCore.getLatestSchema().getAnalyzer());
    //cfg.setInfoStream(System.err);
    cfg.setMergeScheduler(new SerialMergeScheduler());
    cfg.setMergePolicy(new SidecarMergePolicy(mergeTargets, false));
    IndexWriter iw = new IndexWriter(dir, cfg);
    LOG.info("processing " + main.maxDoc() + " docs / " + main.numDeletedDocs() + " dels in main index");
    int boostedDocs = 0;
    Bits live = MultiFields.getLiveDocs(main);
    int targetPos = 0;
    int nextTarget = mergeTargets[targetPos];
    BytesRef idRef = new BytesRef();
    for (int i = 0; i < main.maxDoc(); i++) {
      if (i == nextTarget) {
        iw.commit();
        nextTarget = nextTarget + mergeTargets[++targetPos];
      }
      if (live != null && !live.get(i)) {
        addDummy(iw); // this is required to preserve doc numbers.
        continue;
      } else {
        DocumentStoredFieldVisitor visitor = new DocumentStoredFieldVisitor(docIdField);
        main.document(i, visitor);
        Document doc = visitor.getDocument();
        // get docId
        String id = doc.get(docIdField);
        if (id == null) {
          LOG.debug("missing id, docNo=" + i);
          addDummy(iw);
          continue;
        } else {
          // find the data, if any
          doc = lookup(source, id, idRef, parallelFields);
          if (doc == null) {
            LOG.debug("missing boost data, docId=" + id);
            addDummy(iw);
            continue;
          } else {
            LOG.debug("adding boost data, docId=" + id + ", b=" + doc);
            iw.addDocument(doc);
            boostedDocs++;
          }
        }
      }
    }
    iw.close();
    DirectoryReader other = DirectoryReader.open(dir);
    LOG.info("SidecarIndexReader with " + boostedDocs + " boosted documents.");
    SidecarIndexReader pr = createSidecarIndexReader(main, other, sourceCollection, secondaryIndex);
    return pr;
  } catch (Exception e) {
    LOG.warn("Unable to build parallel index: " + e.toString(), e);
    LOG.warn("Proceeding with single main index.");
    try {
      return new SidecarIndexReader(this, main, null, SidecarIndexReader.getSequentialSubReaders(main),
          sourceCollection, null);
    } catch (Exception e1) {
      LOG.warn("Unexpected exception, returning single main index", e1);
      return main;
    }
  }
}
From source file:com.mathworks.xzheng.indexing.IndexingTest.java
License:Apache License
public void testDeleteAfterOptimize() throws IOException {
  IndexWriter writer = getWriter();
  assertEquals(2, writer.numDocs());
  writer.deleteDocuments(new Term("id", "1"));
  //writer.optimize(); //3
  writer.forceMerge(1);
  writer.commit();
  assertFalse(writer.hasDeletions());
  assertEquals(1, writer.maxDoc()); //C
  assertEquals(1, writer.numDocs()); //C
  writer.close();
}
From source file:com.meizu.nlp.classification.utils.DatasetSplitter.java
License:Apache License
/**
 * Split a given index into 3 indexes for training, test and cross validation tasks respectively
 *
 * @param originalIndex        an {@link org.apache.lucene.index.LeafReader} on the source index
 * @param trainingIndex        a {@link Directory} used to write the training index
 * @param testIndex            a {@link Directory} used to write the test index
 * @param crossValidationIndex a {@link Directory} used to write the cross validation index
 * @param analyzer             {@link Analyzer} used to create the new docs
 * @param fieldNames           names of fields that need to be put in the new indexes or <code>null</code>
 *                             if all should be used
 * @throws IOException if any writing operation fails on any of the indexes
 */
public void split(LeafReader originalIndex, Directory trainingIndex, Directory testIndex,
    Directory crossValidationIndex, Analyzer analyzer, String... fieldNames) throws IOException {
  // create IWs for train / test / cv IDXs
  IndexWriter testWriter = new IndexWriter(testIndex, new IndexWriterConfig(analyzer));
  IndexWriter cvWriter = new IndexWriter(crossValidationIndex, new IndexWriterConfig(analyzer));
  IndexWriter trainingWriter = new IndexWriter(trainingIndex, new IndexWriterConfig(analyzer));
  try {
    int size = originalIndex.maxDoc();
    IndexSearcher indexSearcher = new IndexSearcher(originalIndex);
    TopDocs topDocs = indexSearcher.search(new MatchAllDocsQuery(), Integer.MAX_VALUE);
    // set the type to be indexed, stored, with term vectors
    FieldType ft = new FieldType(TextField.TYPE_STORED);
    ft.setStoreTermVectors(true);
    ft.setStoreTermVectorOffsets(true);
    ft.setStoreTermVectorPositions(true);
    int b = 0;
    // iterate over existing documents
    for (ScoreDoc scoreDoc : topDocs.scoreDocs) {
      // create a new document for indexing
      Document doc = new Document();
      if (fieldNames != null && fieldNames.length > 0) {
        for (String fieldName : fieldNames) {
          doc.add(new Field(fieldName,
              originalIndex.document(scoreDoc.doc).getField(fieldName).stringValue(), ft));
        }
      } else {
        for (IndexableField storableField : originalIndex.document(scoreDoc.doc).getFields()) {
          if (storableField.readerValue() != null) {
            doc.add(new Field(storableField.name(), storableField.readerValue(), ft));
          } else if (storableField.binaryValue() != null) {
            doc.add(new Field(storableField.name(), storableField.binaryValue(), ft));
          } else if (storableField.stringValue() != null) {
            doc.add(new Field(storableField.name(), storableField.stringValue(), ft));
          } else if (storableField.numericValue() != null) {
            doc.add(new Field(storableField.name(), storableField.numericValue().toString(), ft));
          }
        }
      }
      // add it to one of the IDXs
      if (b % 2 == 0 && testWriter.maxDoc() < size * testRatio) {
        testWriter.addDocument(doc);
      } else if (cvWriter.maxDoc() < size * crossValidationRatio) {
        cvWriter.addDocument(doc);
      } else {
        trainingWriter.addDocument(doc);
      }
      b++;
    }
  } catch (Exception e) {
    throw new IOException(e);
  } finally {
    testWriter.commit();
    cvWriter.commit();
    trainingWriter.commit();
    // close IWs
    testWriter.close();
    cvWriter.close();
    trainingWriter.close();
  }
}
From source file:com.mmiagency.knime.nodes.keyworddensity.util.KeywordDensityHelper.java
License:Open Source License
public void execute() throws IOException {
  org.jsoup.nodes.Document jdoc = null;
  // pull content using Jsoup
  if (m_content != null && !m_content.trim().isEmpty()) {
    jdoc = Jsoup.parse(m_content);
  } else {
    Connection conn = Jsoup.connect(m_url);
    conn.validateTLSCertificates(false);
    conn.followRedirects(true);
    conn.userAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10.10; rv:40.0) Gecko/20100101 Firefox/40.0");
    conn.header("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8");
    conn.header("Accept-Language", "en-US,en;q=0.5");
    conn.header("Accept-Encoding", "gzip, deflate");
    conn.execute();
    jdoc = conn.get();
  }
  StringWriter text = new StringWriter();
  if (m_includeMetaKeywords) {
    text.write(jdoc.select("meta[name=keywords]").attr("content"));
    text.write(" ");
  }
  if (m_includeMetaDescription) {
    text.write(jdoc.select("meta[name=description]").attr("content"));
    text.write(" ");
  }
  if (m_includePageTitle) {
    text.write(jdoc.select("title").text());
    text.write(" ");
  }
  text.write(jdoc.select("body").text());
  // analyze content with Lucene
  StandardAnalyzer analyzer = new StandardAnalyzer(Version.LUCENE_CURRENT);
  Directory directory = new RAMDirectory();
  IndexWriter indexWriter = new IndexWriter(directory, analyzer, MaxFieldLength.LIMITED);
  Document doc = new Document();
  Field textField = new Field("content", text.toString(), Field.Store.YES, Field.Index.ANALYZED,
      TermVector.WITH_POSITIONS_OFFSETS);
  doc.add(textField);
  indexWriter.addDocument(doc);
  indexWriter.commit();
  indexWriter.close();
  IndexReader indexReader = IndexReader.open(directory, true);
  TermFreqVector termFreqVector = null;
  for (int i = 0; i < indexReader.maxDoc(); i++) {
    termFreqVector = indexReader.getTermFreqVector(i, "content");
    String[] terms = termFreqVector.getTerms();
    int[] freqs = termFreqVector.getTermFrequencies();
    for (int n = 0; n < termFreqVector.size(); n++) {
      if (m_excludeList.contains(terms[n])) {
        continue;
      }
      add(terms[n], freqs[n]);
    }
  }
  indexReader.close();
  directory.close();
  // sort map by value
  sortMap();
}
From source file:com.netcrest.pado.index.provider.lucene.LuceneBuilderRAMDirectory.java
License:Open Source License
@SuppressWarnings({ "unchecked", "rawtypes", "resource" })
public void buildTemporalKeys(boolean createNewDirectory) {
  Cache cache = CacheFactory.getAnyInstance();
  Region<String, RAMDirectory> region = cache
      .getRegion(IndexMatrixUtil.getProperty(Constants.PROP_REGION_LUCENE));
  TemporalType[] temporalTypes = GemfireTemporalManager.getAllTemporalTypes();
  for (TemporalType type : temporalTypes) {
    IndexWriter writer = null;
    Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_47);
    IndexWriterConfig iwc = new IndexWriterConfig(Version.LUCENE_47, analyzer);
    iwc.setOpenMode(OpenMode.CREATE);
    LuceneField luceneBuilder = new LuceneField();
    LuceneSearch ls = LuceneSearch.getLuceneSearch(type.getFullPath());
    StandardQueryParser parser = ls.createParser();
    TemporalManager tm = TemporalManager.getTemporalManager(type.getFullPath());
    try {
      List<?> identityKeyList = tm.getIdentityKeyList();
      if (identityKeyList.size() == 0) {
        continue;
      }
      RAMDirectory directory;
      if (createNewDirectory) {
        directory = new RAMDirectory();
      } else {
        directory = region.get(type.getFullPath());
        if (directory == null) {
          directory = new RAMDirectory();
        }
      }
      writer = new IndexWriter(directory, iwc);
      // determine the identity key type, public fields and getters
      Field fields[] = null;
      Method getters[] = null;
      Class keyType = null;
      for (Object key : identityKeyList) {
        if (ReflectionHelper.isPrimitiveWrapper(key.getClass())) {
          fields = null;
          getters = null;
          keyType = key.getClass();
        } else {
          fields = ReflectionHelper.getPublicFields(key.getClass());
          getters = ReflectionHelper.getPublicGetters(key.getClass());
        }
        break;
      }
      if (keyType != null) {
        configNumericType(parser, "IdentityKey", keyType);
        // primitive
        List<Document> docList = new ArrayList();
        if (keyType == String.class) {
          for (Object key : identityKeyList) {
            Document doc = luceneBuilder.createDocument();
            doc.add(luceneBuilder.createField("IdentityKey", key.toString()));
            docList.add(doc);
          }
        } else if (keyType == Integer.class) {
          for (Object key : identityKeyList) {
            Document doc = luceneBuilder.createDocument();
            doc.add(luceneBuilder.createField("IdentityKey", (Integer) key));
            docList.add(doc);
          }
        } else if (keyType == Long.class) {
          for (Object key : identityKeyList) {
            Document doc = luceneBuilder.createDocument();
            doc.add(luceneBuilder.createField("IdentityKey", (Long) key));
            docList.add(doc);
          }
        } else if (keyType == Float.class) {
          for (Object key : identityKeyList) {
            Document doc = luceneBuilder.createDocument();
            doc.add(luceneBuilder.createField("IdentityKey", (Float) key));
            docList.add(doc);
          }
        } else if (keyType == Double.class) {
          for (Object key : identityKeyList) {
            Document doc = luceneBuilder.createDocument();
            doc.add(luceneBuilder.createField("IdentityKey", (Double) key));
            docList.add(doc);
          }
        }
        try {
          writer.addDocuments(docList);
        } catch (Exception ex) {
          Logger.warning(ex);
        }
      } else {
        try {
          // fields
          if (fields != null && fields.length > 0) {
            // configure numeric types
            for (Field field : fields) {
              configNumericType(parser, field.getName(), field.getType());
            }
            List<Document> docList = new ArrayList();
            for (Object key : identityKeyList) {
              Document doc = luceneBuilder.createDocument();
              for (Field field : fields) {
                Object obj = field.get(key);
                Class fieldType = field.getType();
                if (fieldType == String.class) {
                  doc.add(luceneBuilder.createField(field.getName(), obj.toString()));
                } else if (fieldType == Integer.class || fieldType == int.class) {
                  doc.add(luceneBuilder.createField(field.getName(), (Integer) obj));
                } else if (fieldType == Long.class || fieldType == long.class) {
                  doc.add(luceneBuilder.createField(field.getName(), (Long) obj));
                } else if (fieldType == Float.class || fieldType == float.class) {
                  doc.add(luceneBuilder.createField(field.getName(), (Float) obj));
                } else if (fieldType == Double.class || fieldType == double.class) {
                  doc.add(luceneBuilder.createField(field.getName(), (Double) obj));
                } else if (fieldType == Date.class) {
                  doc.add(luceneBuilder.createField(field.getName(), ((Date) obj).getTime()));
                }
              }
              docList.add(doc);
            }
            try {
              writer.addDocuments(docList);
            } catch (Exception ex) {
              Logger.warning(ex);
            }
          }
          // getters - methods
          if (getters != null && getters.length > 0) {
            List<Document> docList = new ArrayList();
            for (Object key : identityKeyList) {
              Document doc = luceneBuilder.createDocument();
              for (Method method : getters) {
                Object obj = method.invoke(key);
                Class<?> fieldType = method.getReturnType();
                if (fieldType == String.class) {
                  doc.add(luceneBuilder.createField(getPropertyName(method), obj.toString()));
                } else if (fieldType == Integer.class || fieldType == int.class) {
                  doc.add(luceneBuilder.createField(getPropertyName(method), (Integer) obj));
                } else if (fieldType == Long.class || fieldType == long.class) {
                  doc.add(luceneBuilder.createField(getPropertyName(method), (Long) obj));
                } else if (fieldType == Float.class || fieldType == float.class) {
                  doc.add(luceneBuilder.createField(getPropertyName(method), (Float) obj));
                } else if (fieldType == Double.class || fieldType == double.class) {
                  doc.add(luceneBuilder.createField(getPropertyName(method), (Double) obj));
                } else if (fieldType == Date.class) {
                  doc.add(luceneBuilder.createField(getPropertyName(method), ((Date) obj).getTime()));
                }
              }
              docList.add(doc);
            }
            try {
              writer.addDocuments(docList);
            } catch (Exception ex) {
              Logger.warning(ex);
            }
          }
        } catch (Exception ex) {
          Logger.warning(ex);
        }
      }
      writer.commit();
      writer.close();
      // TODO: support file system
      // place the RAMDirectory in the Lucene region
      region.put(type.getFullPath(), directory);
    } catch (Exception ex) {
      Logger.error("Index builder aborted.", ex);
      return;
    }
  }
}