Usage examples for org.apache.lucene.index.IndexWriter#addDocument
public long addDocument(Iterable<? extends IndexableField> doc) throws IOException
From source file:com.intuit.tank.search.lucene.LuceneService.java
License:Open Source License
/** * Indexes a list of documents./* w w w . j a v a2 s. c o m*/ * * @param docs */ public void indexDocuments(List<Document> docs) { IndexWriter writer = getWriter(); for (Document document : docs) { try { writer.addDocument(document); } catch (Exception e) { e.printStackTrace(); closeWriter(writer); throw new RuntimeException(e); } } closeWriter(writer); }
From source file:com.ivannotes.searchbee.SearchBee.java
License:Apache License
public final void doIndex(DataFetcher<T> df) throws CorruptIndexException, IOException { df.reset();// ww w . ja va 2 s . co m IndexWriter idxWriter = getIndexWriter(); int contiuousException = 0; try { while (df.hasMore()) { try { List<T> data = df.fetchData(); for (T bean : data) { Document doc = buildDocument(bean); idxWriter.addDocument(doc); } idxWriter.commit(); contiuousException = 0; } catch (Exception e) { contiuousException++; logger.error("build index error", e); if (contiuousException > 100) { logger.error("build index exceed max continuous exception count(100), exit build."); break; } } } } finally { if (null != idxWriter) { idxWriter.close(); } } }
From source file:com.jaeksoft.searchlib.index.WriterLocal.java
License:Open Source License
/**
 * Adds a single document to the index, opening a writer for the call and
 * closing it afterwards.
 *
 * @param document the document to index
 * @throws IOException        on index I/O failure
 * @throws SearchLibException on library-level failure while opening the writer
 * @deprecated single-document writes reopen the writer each time; prefer batched APIs
 */
@Deprecated
public void addDocument(Document document) throws IOException, SearchLibException {
    IndexWriter writer = null;
    try {
        writer = open();
        writer.addDocument(document);
    } finally {
        // close(null) is tolerated when open() itself threw.
        close(writer);
    }
}
From source file:com.jaeksoft.searchlib.index.WriterLucene.java
License:Open Source License
/**
 * Adds a single document to the index while holding the read lock.
 *
 * @param document the document to index
 * @throws IOException        on index I/O failure
 * @throws SearchLibException on library-level failure while opening the writer
 * @deprecated single-document writes reopen the writer each time
 */
@Deprecated
public void addDocument(Document document) throws IOException, SearchLibException {
    IndexWriter indexWriter = null;
    lock.rl.lock();
    try {
        indexWriter = open();
        indexWriter.addDocument(document);
        // Happy path: close here, then null the reference so the close() in
        // the finally block becomes a no-op. On an exception before this point
        // the finally block performs the real close instead. This ordering is
        // deliberate — do not merge the two close calls.
        close(indexWriter);
        indexWriter = null;
    } finally {
        lock.rl.unlock();
        close(indexWriter);
    }
}
From source file:com.javapr.plaintextindex.search.Index.java
License:Apache License
public static void indexDocs(IndexWriter writer, File file) throws IOException, SAXException, TikaException { // nur lesbare Dateien verwenden if (file.canRead()) { if (file.isDirectory()) { String[] files = file.list(); if (files != null) { for (int i = 0; i < files.length; i++) { indexDocs(writer, new File(file, files[i])); }//from w ww. j a va 2 s .com } } else { FileInputStream fis; try { fis = new FileInputStream(file); } catch (FileNotFoundException fnfe) { return; } try { //Word Dokumente mit Tika parsen ContentHandler contenthandler = new BodyContentHandler(); Metadata metadata = new Metadata(); metadata.set(Metadata.RESOURCE_NAME_KEY, file.getName()); Parser parser = new AutoDetectParser(); parser.parse(fis, contenthandler, metadata, new ParseContext()); // Lucene Dokumenten-Objekt erstellen und geparsten Tika-Inhalt speichern Document doc = new Document(); Field pathField = new StringField("path", file.getPath(), Field.Store.YES); doc.add(pathField); Field filename = new StringField("filename", file.getName(), Field.Store.YES); doc.add(filename); doc.add(new LongField("modified", file.lastModified(), Field.Store.NO)); doc.add(new TextField("contents", contenthandler.toString(), Field.Store.NO)); if (writer.getConfig().getOpenMode() == OpenMode.CREATE) { //neuer Index, wenn neues Dokument System.out.println("adding " + file); writer.addDocument(doc); } else { long size = file.length() / 1024; list.add(file + ", " + size + "kb"); //Index updaten, wenn lteres Index-Dokument schon vorhanden System.out.println("updating " + file); writer.updateDocument(new Term("path", file.getPath()), doc); } } finally { fis.close(); } } } }
From source file:com.jivesoftware.forum.database.DbSearchManager.java
License:Open Source License
/** * Indexes an indivual message. The writer is assumed to be open when * passed in and will remain open after the method is done executing. *///from w w w . j a v a 2s . co m protected final void addMessageToIndex(long messageID, long userID, long threadID, long forumID, String subject, String body, java.util.Date creationDate, IndexWriter writer) throws IOException { if (writer == null) { return; } Document doc = new Document(); doc.add(Field.Keyword("messageID", Long.toString(messageID))); doc.add(new Field("userID", Long.toString(userID), false, true, false)); doc.add(new Field("threadID", Long.toString(threadID), false, true, false)); doc.add(new Field("forumID", Long.toString(forumID), false, true, false)); doc.add(Field.UnStored("subject", subject)); doc.add(Field.UnStored("body", body)); doc.add(new Field("creationDate", DateField.dateToString(creationDate), false, true, false)); writer.addDocument(doc); }
From source file:com.justinleegrant.myluceneplayground.SimpleFacetsExample.java
License:Apache License
/** Build the example index. */ private void index() throws IOException { IndexWriter indexWriter = new IndexWriter(indexDir, new IndexWriterConfig(Version.LATEST, new WhitespaceAnalyzer()).setOpenMode(OpenMode.CREATE)); // Writes facet ords to a separate directory from the main index DirectoryTaxonomyWriter taxoWriter = new DirectoryTaxonomyWriter(taxoDir); Document doc = new Document(); doc.add(new FacetField("Author", "Bob")); doc.add(new FacetField("Publish Date", "2010", "10", "15")); indexWriter.addDocument(config.build(taxoWriter, doc)); doc = new Document(); doc.add(new FacetField("Author", "Lisa")); doc.add(new FacetField("Publish Date", "2010", "10", "20")); indexWriter.addDocument(config.build(taxoWriter, doc)); doc = new Document(); doc.add(new FacetField("Author", "Lisa")); doc.add(new FacetField("Publish Date", "2012", "1", "1")); indexWriter.addDocument(config.build(taxoWriter, doc)); doc = new Document(); doc.add(new FacetField("Author", "Susan")); doc.add(new FacetField("Publish Date", "2012", "1", "7")); indexWriter.addDocument(config.build(taxoWriter, doc)); doc = new Document(); doc.add(new FacetField("Author", "Frank")); doc.add(new FacetField("Publish Date", "1999", "5", "5")); indexWriter.addDocument(config.build(taxoWriter, doc)); indexWriter.close();//from ww w . jav a 2 s.c om taxoWriter.close(); }
From source file:com.khepry.frackhem.entities.Blendeds.java
License:Apache License
public void indexViaLucene(String textFilePath, String textColSeparator, String casEdfIdFieldName, Map<String, Toxicity> toxicities) throws IOException { String message;/*from w w w .j av a 2s . com*/ message = "Start Indexing Blendeds via Lucene..."; if (outputToSystemOut) { System.out.println(message); } if (outputToMsgQueue) { progressMessageQueue.send(new MessageInput(message)); } File textFile = new File(textFilePath); if (textFile.exists()) { File indexFolder = new File(indexFolderPath); if (!indexFolder.exists()) { indexFolder.mkdir(); } else { deleteFolder(indexFolder); if (!indexFolder.exists()) { indexFolder.mkdir(); } } File taxonomyFolder = new File(taxonomyFolderPath); if (!taxonomyFolder.exists()) { taxonomyFolder.mkdir(); } else { deleteFolder(taxonomyFolder); if (!taxonomyFolder.exists()) { taxonomyFolder.mkdir(); } } if (indexFolder.exists() && taxonomyFolder.exists()) { List<String> colHeaders = new ArrayList<>(); Map<String, Integer> colIndexes = new LinkedHashMap<>(); Map<String, String> mapIndexFields = new LinkedHashMap<>(); Map<String, String> mapStatsFields = new LinkedHashMap<>(); String[] pieces; String[] tuples; pieces = indexFields.split(","); for (String indexField : pieces) { mapIndexFields.put(indexField, indexField); } pieces = statsFields.split(","); for (String statField : pieces) { tuples = statField.split(":"); mapStatsFields.put(tuples[0], tuples.length > 1 ? 
tuples[1] : tuples[0]); } SimpleFSDirectory indexDirectory = new SimpleFSDirectory(indexFolder); Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_44); IndexWriterConfig indexWriterConfig = new IndexWriterConfig(Version.LUCENE_44, analyzer); IndexWriter indexWriter = new IndexWriter(indexDirectory, indexWriterConfig); SimpleFSDirectory taxonomyDirectory = new SimpleFSDirectory(taxonomyFolder); TaxonomyWriter taxonomyWriter = new DirectoryTaxonomyWriter(taxonomyDirectory, OpenMode.CREATE); FacetFields facetFields = new FacetFields(taxonomyWriter); List<CategoryPath> taxonomyCategories = new ArrayList<>(); String line; Integer rcdCount = 0; StringBuilder sb = new StringBuilder(); BufferedReader br = new BufferedReader(new FileReader(textFile)); while ((line = br.readLine()) != null) { rcdCount++; pieces = line.split(textColSeparator); if (rcdCount == 1) { int i = 0; for (String colHeader : pieces) { colHeaders.add(colHeader.trim()); colIndexes.put(colHeader, i); } } else { if (pieces.length == colHeaders.size()) { sb.setLength(0); Document document = new Document(); for (int i = 0; i < pieces.length; i++) { Field field = new TextField(colHeaders.get(i), pieces[i].trim(), Store.YES); document.add(field); if (mapIndexFields.containsKey(colHeaders.get(i))) { if (!pieces[i].trim().equals("")) { sb.append(pieces[i].trim()); sb.append(" "); } } } // append toxicity information to the document String toxCasEdfId = document.get(casEdfIdFieldName).trim(); Toxicity toxicity = new Toxicity(); if (toxicities.containsKey(toxCasEdfId)) { toxicity = toxicities.get(toxCasEdfId); document.add(new TextField("toxChemicalName", toxicity.getToxChemicalName().trim(), Store.YES)); sb.append(toxicity.getToxChemicalName().trim()); sb.append(" "); document.add(new TextField("toxRecognized", toxicity.getToxRecognized().trim(), Store.YES)); sb.append(toxicity.getToxRecognized().trim()); sb.append(" "); document.add(new TextField("toxSuspected", toxicity.getToxSuspected().trim(), 
Store.YES)); sb.append(toxicity.getToxSuspected().trim()); sb.append(" "); } else { document.add(new TextField("toxChemicalName", "", Store.YES)); document.add(new TextField("toxRecognized", "", Store.YES)); document.add(new TextField("toxSuspected", "", Store.YES)); } Field field = new TextField("text", sb.toString().trim(), Store.NO); document.add(field); String toxChemical = toxicity.getToxChemicalName().trim(); // categorize recognized toxicities String toxRecognized = toxicity.getToxRecognized().trim(); if (!toxRecognized.equals("")) { taxonomyCategories.add(new CategoryPath("toxRecognized", "CasEdfId", toxCasEdfId)); taxonomyCategories.add(new CategoryPath("toxRecognized", "Chemical", toxChemical.replace("/", "|"))); for (String value : toxRecognized.replace(" ", ",").split(",")) { if (!value.trim().equals("")) { taxonomyCategories .add(new CategoryPath("toxRecognized", "Toxicity", value)); } } } // categorize suspected toxicities String toxSuspected = toxicity.getToxSuspected().trim(); if (!toxSuspected.equals("")) { taxonomyCategories.add(new CategoryPath("toxSuspected", "CasEdfId", toxCasEdfId)); taxonomyCategories.add(new CategoryPath("toxSuspected", "Chemical", toxChemical.replace("/", "|"))); for (String value : toxSuspected.replace(" ", ",").split(",")) { if (!value.trim().equals("")) { taxonomyCategories.add(new CategoryPath("toxSuspected", "Toxicity", value)); } } } // build up "stats" taxonomy categories for (String statsKey : mapStatsFields.keySet()) { if (mapIndexFields.containsKey(statsKey)) { String fieldValue = mapIndexFields.get(statsKey); if (!statsKey.trim().equals("") && !fieldValue.trim().equals("")) { taxonomyCategories.add(new CategoryPath("Blendeds", statsKey, fieldValue)); } } } if (taxonomyCategories.size() > 0) { facetFields.addFields(document, taxonomyCategories); // System.out.println("Taxonomies added: " + // taxonomyCategories.size()); } indexWriter.addDocument(document); if (progressInterval > 0 && rcdCount % progressInterval == 
0) { message = "Records indexed: " + rcdCount; if (outputToSystemOut) { System.out.println(message); } if (outputToMsgQueue) { progressMessageQueue.send(new MessageInput(message)); } } taxonomyCategories.clear(); } } } br.close(); message = "Records indexed: " + rcdCount; if (outputToSystemOut) { System.out.println(message); } if (outputToMsgQueue) { progressMessageQueue.send(new MessageInput(message)); } sb.setLength(0); sb.trimToSize(); indexWriter.commit(); indexWriter.forceMerge(1); indexWriter.close(); taxonomyWriter.commit(); taxonomyWriter.close(); analyzer.close(); indexDirectory.close(); taxonomyDirectory.close(); } else { message = "Lucene Index Folder: " + indexFolder + " or Lucene Taxonomy folder: " + taxonomyFolder + " does not exist!"; if (outputToSystemErr) { System.err.println(message); } if (outputToMsgQueue) { progressMessageQueue.send(new MessageInput(message)); } } message = "Ended Indexing Blendeds via Lucene!"; if (outputToSystemOut) { System.out.println(message); } if (outputToMsgQueue) { progressMessageQueue.send(new MessageInput(message)); } } }
From source file:com.khepry.frackhem.entities.Chemicals.java
License:Apache License
public void indexViaLucene(String textFilePath, String textColSeparator, String casEdfIdFieldName, Map<String, Toxicity> toxicities) throws IOException { String message;/* w w w . j a v a 2s . co m*/ message = "Start Indexing Chemicals via Lucene..."; if (outputToSystemOut) { System.out.println(message); } if (outputToMsgQueue) { progressMessageQueue.send(new MessageInput(message)); } File textFile = new File(textFilePath); if (textFile.exists()) { File indexFolder = new File(indexFolderPath); if (!indexFolder.exists()) { indexFolder.mkdir(); } else { deleteFolder(indexFolder); if (!indexFolder.exists()) { indexFolder.mkdir(); } } File taxonomyFolder = new File(taxonomyFolderPath); if (!taxonomyFolder.exists()) { taxonomyFolder.mkdir(); } else { deleteFolder(taxonomyFolder); if (!taxonomyFolder.exists()) { taxonomyFolder.mkdir(); } } if (indexFolder.exists() && taxonomyFolder.exists()) { List<String> colHeaders = new ArrayList<>(); Map<String, Integer> colIndexes = new LinkedHashMap<>(); Map<String, String> mapIndexFields = new LinkedHashMap<>(); Map<String, String> mapStatsFields = new LinkedHashMap<>(); String[] pieces; String[] tuples; pieces = indexFields.split(","); for (String indexField : pieces) { mapIndexFields.put(indexField, indexField); } pieces = statsFields.split(","); for (String statField : pieces) { tuples = statField.split(":"); mapStatsFields.put(tuples[0], tuples.length > 1 ? 
tuples[1] : tuples[0]); } SimpleFSDirectory indexDirectory = new SimpleFSDirectory(indexFolder); Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_44); IndexWriterConfig indexWriterConfig = new IndexWriterConfig(Version.LUCENE_44, analyzer); IndexWriter indexWriter = new IndexWriter(indexDirectory, indexWriterConfig); SimpleFSDirectory taxonomyDirectory = new SimpleFSDirectory(taxonomyFolder); TaxonomyWriter taxonomyWriter = new DirectoryTaxonomyWriter(taxonomyDirectory, OpenMode.CREATE); FacetFields facetFields = new FacetFields(taxonomyWriter); List<CategoryPath> taxonomyCategories = new ArrayList<>(); String line; Integer rcdCount = 0; StringBuilder sb = new StringBuilder(); BufferedReader br = new BufferedReader(new FileReader(textFile)); while ((line = br.readLine()) != null) { rcdCount++; pieces = line.split(textColSeparator); if (rcdCount == 1) { int i = 0; for (String colHeader : pieces) { colHeaders.add(colHeader.trim()); colIndexes.put(colHeader, i); } } else { if (pieces.length == colHeaders.size()) { sb.setLength(0); Document document = new Document(); for (int i = 0; i < pieces.length; i++) { Field field = new TextField(colHeaders.get(i), pieces[i].trim(), Store.YES); document.add(field); if (mapIndexFields.containsKey(colHeaders.get(i))) { if (!pieces[i].trim().equals("")) { sb.append(pieces[i].trim()); sb.append(" "); } } } // append toxicity information to the document String toxCasEdfId = document.get(casEdfIdFieldName).trim(); Toxicity toxicity = new Toxicity(); if (toxicities.containsKey(toxCasEdfId)) { toxicity = toxicities.get(toxCasEdfId); document.add(new TextField("toxChemicalName", toxicity.getToxChemicalName().trim(), Store.YES)); sb.append(toxicity.getToxChemicalName().trim()); sb.append(" "); document.add(new TextField("toxRecognized", toxicity.getToxRecognized().trim(), Store.YES)); sb.append(toxicity.getToxRecognized().trim()); sb.append(" "); document.add(new TextField("toxSuspected", toxicity.getToxSuspected().trim(), 
Store.YES)); sb.append(toxicity.getToxSuspected().trim()); sb.append(" "); } else { document.add(new TextField("toxChemicalName", "", Store.YES)); document.add(new TextField("toxRecognized", "", Store.YES)); document.add(new TextField("toxSuspected", "", Store.YES)); } Field field = new TextField("text", sb.toString().trim(), Store.NO); document.add(field); String toxChemical = toxicity.getToxChemicalName().trim(); // categorize recognized toxicities String toxRecognized = toxicity.getToxRecognized().trim(); if (!toxRecognized.equals("")) { taxonomyCategories.add(new CategoryPath("toxRecognized", "CasEdfId", toxCasEdfId)); taxonomyCategories.add(new CategoryPath("toxRecognized", "Chemical", toxChemical.replace("/", "|"))); for (String value : toxRecognized.replace(" ", ",").split(",")) { if (!value.trim().equals("")) { taxonomyCategories .add(new CategoryPath("toxRecognized", "Toxicity", value)); } } } // categorize suspected toxicities String toxSuspected = toxicity.getToxSuspected().trim(); if (!toxSuspected.equals("")) { taxonomyCategories.add(new CategoryPath("toxSuspected", "CasEdfId", toxCasEdfId)); taxonomyCategories.add(new CategoryPath("toxSuspected", "Chemical", toxChemical.replace("/", "|"))); for (String value : toxSuspected.replace(" ", ",").split(",")) { if (!value.trim().equals("")) { taxonomyCategories.add(new CategoryPath("toxSuspected", "Toxicity", value)); } } } // build up "stats" taxonomy categories for (String statsKey : mapStatsFields.keySet()) { if (mapIndexFields.containsKey(statsKey)) { String fieldValue = mapIndexFields.get(statsKey); if (!statsKey.trim().equals("") && !fieldValue.trim().equals("")) { taxonomyCategories.add(new CategoryPath("Chemicals", statsKey, fieldValue)); } } } if (taxonomyCategories.size() > 0) { facetFields.addFields(document, taxonomyCategories); // System.out.println("Taxonomies added: " + // taxonomyCategories.size()); } indexWriter.addDocument(document); if (progressInterval > 0 && rcdCount % progressInterval 
== 0) { message = "Records indexed: " + rcdCount; if (outputToSystemOut) { System.out.println(message); } if (outputToMsgQueue) { progressMessageQueue.send(new MessageInput(message)); } } taxonomyCategories.clear(); } } } br.close(); message = "Records indexed: " + rcdCount; if (outputToSystemOut) { System.out.println(message); } if (outputToMsgQueue) { progressMessageQueue.send(new MessageInput(message)); } sb.setLength(0); sb.trimToSize(); indexWriter.commit(); indexWriter.forceMerge(1); indexWriter.close(); taxonomyWriter.commit(); taxonomyWriter.close(); analyzer.close(); indexDirectory.close(); taxonomyDirectory.close(); } else { message = "Lucene Index Folder: " + indexFolder + " or Lucene Taxonomy folder: " + taxonomyFolder + " does not exist!"; if (outputToSystemErr) { System.err.println(message); } if (outputToMsgQueue) { progressMessageQueue.send(new MessageInput(message)); } } message = "Ended Indexing Chemicals via Lucene!"; if (outputToSystemOut) { System.out.println(message); } if (outputToMsgQueue) { progressMessageQueue.send(new MessageInput(message)); } } }
From source file:com.khepry.frackhem.entities.Reports.java
License:Apache License
/**
 * Indexes the reports text file into a Lucene index plus a facet taxonomy,
 * aggregating rows by a level-break on the configured level fields.
 *
 * <p>The first line is a column header row. Data rows are accumulated until
 * the values of the level fields change ("level break"); at each break — and
 * once more at end-of-file — one aggregated document is written containing the
 * break-group's field values plus the toxicity terms collected from the
 * group's rows. The statement order inside the loop is load-bearing: the
 * break check runs BEFORE the current row's values are folded into the
 * accumulators, so the flushed document describes the previous group.
 *
 * @param textPath         path to the column-separated input file
 * @param textColSeparator column separator regex passed to String.split
 * @param toxicities       toxicity records keyed by CAS/EDF id
 * @param parseFields      toxicity field names to aggregate per group;
 *                         NOTE(review): the body dereferences
 *                         mapToxValues.get("toxRecognized") and
 *                         mapToxValues.get("toxSuspected") directly, so callers
 *                         must include both names or this NPEs — confirm.
 * @throws IOException on read or index-write failure
 */
public void indexViaLucene(String textPath, String textColSeparator, Map<String, Toxicity> toxicities,
        String... parseFields) throws IOException {
    String message;
    message = "Start Indexing Reports via Lucene...";
    if (outputToSystemOut) {
        System.out.println(message);
    }
    if (outputToMsgQueue) {
        progressMessageQueue.send(new MessageInput(message));
    }
    File textFile = new File(textPath);
    if (textFile.exists()) {
        File indexFolder = new File(indexFolderPath);
        if (!indexFolder.exists()) {
            indexFolder.mkdir();
        }
        File taxonomyFolder = new File(taxonomyFolderPath);
        if (!taxonomyFolder.exists()) {
            taxonomyFolder.mkdir();
        }
        if (indexFolder.exists() && taxonomyFolder.exists()) {
            // Wipe and recreate both folders for a from-scratch build.
            deleteFolder(indexFolder);
            if (!indexFolder.exists()) {
                indexFolder.mkdir();
            }
            deleteFolder(taxonomyFolder);
            if (!taxonomyFolder.exists()) {
                taxonomyFolder.mkdir();
            }
            // mapBreakFields holds the previous group's level values;
            // mapLevelFields holds the current row's — inequality = level break.
            Map<String, String> mapBreakFields = new LinkedHashMap<>();
            Map<String, String> mapIndexFields = new LinkedHashMap<>();
            Map<String, String> mapLevelFields = new LinkedHashMap<>();
            Map<String, String> mapStatsFields = new LinkedHashMap<>();
            Map<String, Integer> mapColIndexes = new LinkedHashMap<>();
            String[] pieces;
            String[] tuples;
            pieces = indexFields.split(",");
            for (String indexField : pieces) {
                mapIndexFields.put(indexField, "");
            }
            pieces = levelFields.split(",");
            for (String levelField : pieces) {
                mapBreakFields.put(levelField, "");
                mapLevelFields.put(levelField, "");
            }
            // statsFields entries are "name" or "name:alias".
            pieces = statsFields.split(",");
            for (String statField : pieces) {
                tuples = statField.split(":");
                mapStatsFields.put(tuples[0], tuples.length > 1 ? tuples[1] : tuples[0]);
            }
            // One sorted value-set per toxicity parse field, cleared at each break.
            Map<String, Map<String, String>> mapToxValues = new LinkedHashMap<>();
            for (String parseField : parseFields) {
                mapToxValues.put(parseField, new TreeMap<String, String>());
            }
            SimpleFSDirectory indexDirectory = new SimpleFSDirectory(indexFolder);
            SimpleFSDirectory taxonomyDirectory = new SimpleFSDirectory(taxonomyFolder);
            Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_44);
            IndexWriterConfig indexWriterConfig = new IndexWriterConfig(Version.LUCENE_44, analyzer);
            IndexWriter indexWriter = new IndexWriter(indexDirectory, indexWriterConfig);
            TaxonomyWriter taxonomyWriter = new DirectoryTaxonomyWriter(taxonomyDirectory, OpenMode.CREATE);
            FacetFields facetFields = new FacetFields(taxonomyWriter);
            List<CategoryPath> taxonomyCategories = new ArrayList<>();
            String line;
            StringBuilder sbIndex = new StringBuilder();
            // NOTE(review): sbLevel is written to/cleared but never read — dead local.
            StringBuilder sbLevel = new StringBuilder();
            Integer outCount = 0;
            Integer rcdCount = 0;
            Boolean firstDataRecordHandled = false;
            BufferedReader br = new BufferedReader(new FileReader(textFile));
            while ((line = br.readLine()) != null) {
                rcdCount++;
                pieces = line.split(textColSeparator);
                if (rcdCount == 1) {
                    // Header row: record each column's position.
                    int i = 0;
                    for (String colHeader : pieces) {
                        mapColIndexes.put(colHeader.trim(), i);
                        i++;
                    }
                } else {
                    // Refresh the current row's level-field values.
                    for (String key : mapLevelFields.keySet()) {
                        if (mapColIndexes.containsKey(key)) {
                            String value = pieces[mapColIndexes.get(key)].trim();
                            // build up level-break values
                            if (mapLevelFields.containsKey(key)) {
                                mapLevelFields.put(key, value);
                            }
                        }
                    }
                    // Seed the break values from the first data record so the
                    // first real row never triggers a spurious break.
                    if (!firstDataRecordHandled) {
                        mapBreakFields.putAll(mapLevelFields);
                        firstDataRecordHandled = true;
                    }
                    // if there is a "level break": flush the previous group.
                    if (!mapLevelFields.equals(mapBreakFields)) {
                        Document tgtDocument = new Document();
                        for (Map.Entry<String, String> entry : mapBreakFields.entrySet()) {
                            Field field = new TextField(entry.getKey(), entry.getValue(), Store.YES);
                            tgtDocument.add(field);
                        }
                        for (Map.Entry<String, Map<String, String>> toxEntry : mapToxValues.entrySet()) {
                            String fieldName = toxEntry.getKey();
                            String fieldValue = GenericUtilities.joinString(toxEntry.getValue().values(), " ");
                            sbIndex.append(fieldValue);
                            sbIndex.append(" ");
                            tgtDocument.add(new TextField(fieldName, fieldValue, Store.YES));
                            // build up "Toxicity" taxonomy categories
                            for (String value : fieldValue.replace(" ", ",").split(",")) {
                                if (!value.trim().equals("")) {
                                    taxonomyCategories.add(new CategoryPath(fieldName, "Toxicity", value));
                                }
                            }
                            // build up "stats" taxonomy categories
                            for (String statsKey : mapStatsFields.keySet()) {
                                if (mapLevelFields.containsKey(statsKey)) {
                                    String levelValue = mapLevelFields.get(statsKey);
                                    if (!statsKey.trim().equals("") && !levelValue.trim().equals("")) {
                                        taxonomyCategories
                                                .add(new CategoryPath("Reports", statsKey, levelValue));
                                    }
                                }
                            }
                        }
                        tgtDocument.add(new TextField("text", sbIndex.toString().trim(), Store.NO));
                        if (taxonomyCategories.size() > 0) {
                            facetFields.addFields(tgtDocument, taxonomyCategories);
                        }
                        indexWriter.addDocument(tgtDocument);
                        outCount++;
                        // Reset the per-group accumulators and advance the break values.
                        sbIndex.setLength(0);
                        for (String key : mapToxValues.keySet()) {
                            mapToxValues.get(key).clear();
                        }
                        taxonomyCategories.clear();
                        mapBreakFields.putAll(mapLevelFields);
                    }
                    // build up text index values (current row, AFTER any flush)
                    for (String key : mapLevelFields.keySet()) {
                        if (mapColIndexes.containsKey(key)) {
                            String value = pieces[mapColIndexes.get(key)].trim();
                            if (!value.equals("")) {
                                // build up 'text' field index value
                                if (mapIndexFields.containsKey(key)) {
                                    sbIndex.append(value);
                                    sbIndex.append(" ");
                                }
                            }
                        }
                    }
                    // build up toxicity values for later level-break use
                    if (mapColIndexes.containsKey(casEdfIdFieldName)) {
                        Toxicity toxicity = toxicities.get(pieces[mapColIndexes.get(casEdfIdFieldName)].trim());
                        if (toxicity != null) {
                            // build up recognized toxicity values (deduped via map keys)
                            String[] toxRValues = toxicity.getToxRecognized().split(",");
                            for (String toxValue : toxRValues) {
                                if (!toxValue.equals("")) {
                                    if (!mapToxValues.get("toxRecognized").containsKey(toxValue)) {
                                        mapToxValues.get("toxRecognized").put(toxValue, toxValue);
                                    }
                                }
                            }
                            // build up suspected toxicity values
                            String[] toxSValues = toxicity.getToxSuspected().split(",");
                            for (String toxValue : toxSValues) {
                                if (!toxValue.equals("")) {
                                    if (!mapToxValues.get("toxSuspected").containsKey(toxValue)) {
                                        mapToxValues.get("toxSuspected").put(toxValue, toxValue);
                                    }
                                }
                            }
                        }
                    }
                    if (progressInterval > 0 && rcdCount % progressInterval == 0) {
                        message = "Records indexed: " + rcdCount;
                        if (outputToSystemOut) {
                            System.out.println(message);
                        }
                        if (outputToMsgQueue) {
                            progressMessageQueue.send(new MessageInput(message));
                        }
                    }
                }
            }
            br.close();
            // handle end-of-file processing: flush the final group (duplicates
            // the in-loop flush above; keep the two in sync when editing).
            Document tgtDocument = new Document();
            for (Map.Entry<String, String> entry : mapBreakFields.entrySet()) {
                Field field = new TextField(entry.getKey(), entry.getValue(), Store.YES);
                tgtDocument.add(field);
            }
            for (Map.Entry<String, Map<String, String>> toxEntry : mapToxValues.entrySet()) {
                String fieldName = toxEntry.getKey();
                String fieldValue = GenericUtilities.joinString(toxEntry.getValue().values(), " ");
                sbIndex.append(fieldValue);
                sbIndex.append(" ");
                tgtDocument.add(new TextField(fieldName, fieldValue, Store.YES));
                // build up "Toxicity" taxonomy categories
                for (String value : fieldValue.replace(" ", ",").split(",")) {
                    if (!value.trim().equals("")) {
                        taxonomyCategories.add(new CategoryPath(fieldName, "Toxicity", value));
                    }
                }
                // build up "stats" taxonomy categories
                for (String statsKey : mapStatsFields.keySet()) {
                    if (mapLevelFields.containsKey(statsKey)) {
                        String levelValue = mapLevelFields.get(statsKey);
                        if (!statsKey.trim().equals("") && !levelValue.trim().equals("")) {
                            taxonomyCategories.add(new CategoryPath("Reports", statsKey, levelValue));
                        }
                    }
                }
            }
            tgtDocument.add(new TextField("text", sbIndex.toString().trim(), Store.NO));
            if (taxonomyCategories.size() > 0) {
                facetFields.addFields(tgtDocument, taxonomyCategories);
            }
            indexWriter.addDocument(tgtDocument);
            outCount++;
            message = "Records processed: " + rcdCount;
            if (outputToSystemOut) {
                System.out.println(message);
            }
            if (outputToMsgQueue) {
                progressMessageQueue.send(new MessageInput(message));
            }
            message = "Records indexed: " + outCount;
            if (outputToSystemOut) {
                System.out.println(message);
            }
            if (outputToMsgQueue) {
                progressMessageQueue.send(new MessageInput(message));
            }
            sbIndex.setLength(0);
            sbIndex.trimToSize();
            sbLevel.setLength(0);
            sbLevel.trimToSize();
            mapToxValues.clear();
            indexWriter.commit();
            indexWriter.forceMerge(1);
            indexWriter.close();
            analyzer.close();
            indexDirectory.close();
            taxonomyWriter.commit();
            taxonomyWriter.close();
            taxonomyDirectory.close();
        } else {
            message = "Lucene Index Folder: " + indexFolder + " or Lucene Taxonomy folder: " + taxonomyFolder
                    + " does not exist!";
            if (outputToSystemErr) {
                System.err.println(message);
            }
            if (outputToMsgQueue) {
                progressMessageQueue.send(new MessageInput(message));
            }
        }
        message = "Ended Indexing Reports via Lucene!";
        if (outputToSystemOut) {
            System.out.println(message);
        }
        if (outputToMsgQueue) {
            progressMessageQueue.send(new MessageInput(message));
        }
    }
}