List of usage examples for org.apache.lucene.index.IndexWriter.close()
@Override public void close() throws IOException
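Before the project examples below, here is a minimal sketch of the modern close idiom. IndexWriter implements Closeable, so on Java 7+ a try-with-resources block closes it even when indexing throws. This is an illustrative sketch assuming the Lucene 4.x-era APIs most examples on this page target; the index path is hypothetical.

// Sketch: the modern close idiom (Lucene 4.x-era APIs assumed; index path is hypothetical).
// Needed imports: java.io.File, org.apache.lucene.analysis.standard.StandardAnalyzer,
// org.apache.lucene.document.*, org.apache.lucene.index.*, org.apache.lucene.store.*,
// org.apache.lucene.util.Version.
public static void writeOneDocument() throws IOException {
    Directory dir = FSDirectory.open(new File("/tmp/example-index")); // hypothetical location
    IndexWriterConfig iwc = new IndexWriterConfig(Version.LUCENE_44, new StandardAnalyzer(Version.LUCENE_44));
    try (IndexWriter writer = new IndexWriter(dir, iwc)) { // IndexWriter implements Closeable
        Document doc = new Document();
        doc.add(new TextField("body", "hello lucene", Store.YES));
        writer.addDocument(doc);
    } // close() runs even if addDocument throws, committing pending changes and freeing the write lock
    dir.close();
}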
From source file:com.jaeksoft.searchlib.index.WriterLocal.java
License:Open Source License
private void close(IndexWriter indexWriter) {
    if (indexWriter == null)
        return;
    try {
        indexWriter.close();
    } catch (Exception e) {
        Logging.warn(e);
    } finally {
        indexDirectory.unlock();
    }
}
From source file:com.jaeksoft.searchlib.index.WriterLucene.java
License:Open Source License
private void close(IndexWriter indexWriter) {
    if (indexWriter == null)
        return;
    try {
        indexWriter.close();
    } catch (Throwable e) {
        Logging.warn(e);
    } finally {
        indexDirectory.unlock();
    }
}
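The two helpers above are identical except that one suppresses Exception and the other Throwable while closing. As a hedged alternative sketch: Lucene's own org.apache.lucene.util.IOUtils, present in the 4.x line these snippets target, offers a null-safe quiet close that would collapse both into one method; note it does not log the suppressed failure, unlike Logging.warn(e) in the originals. The unlock call stays in the finally block exactly as written there.

private void close(IndexWriter indexWriter) {
    try {
        // Null-safe; any exception thrown by close() is suppressed rather than propagated.
        IOUtils.closeWhileHandlingException(indexWriter);
    } finally {
        indexDirectory.unlock();
    }
}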
From source file:com.javapr.plaintextindex.search.Index.java
License:Apache License
public void erstelleIndex() throws SAXException, TikaException {
    frame = new JFrame();
    frame.setTitle("Lucene Suche");
    frame.setSize(400, 280);
    frame.setLocation(400, 400);
    frame.setVisible(true);
    panel = new JPanel(new BorderLayout(5, 5));
    panel1 = new JPanel(new GridLayout(1, 1));
    panelLabel = new JPanel(new GridLayout(1, 1));
    pane = new JScrollPane(panel1);
    liste = new JList();
    doc = new JLabel();
    indexdir = new JLabel();
    panel.add(panelLabel, BorderLayout.NORTH);
    panel.add(pane, BorderLayout.CENTER);
    panel.setLayout(new GridLayout(2, 1));
    frame.add(panel);
    // check whether the directory with the text files exists and is readable
    final File docDir = new File(Prop.Filestoindex(null));
    if (!docDir.exists() || !docDir.canRead()) {
        doc = new JLabel("Dokumenten-Verzeichnis '" + docDir.getAbsolutePath()
                + "' kann nicht gelesen werden, bitte Pfad überprüfen!");
        panelLabel.add(doc);
    }
    // read in the text files
    if (docDir.exists() && docDir.canRead()) { // was "||" in the original, which would also index an unreadable directory
        Date start = new Date();
        try {
            indexdir = new JLabel("Index Verzeichnis: '" + Prop.Indexdirectory(null) + "'...");
            panelLabel.add(indexdir);
            Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_42);
            IndexWriterConfig iwc = new IndexWriterConfig(Version.LUCENE_42, analyzer);
            Directory directory = FSDirectory.open(new File(Prop.Indexdirectory(null)));
            IndexWriter writer = new IndexWriter(directory, iwc);
            // call indexDocs and write into the index directory
            indexDocs(writer, docDir);
            writer.close();
            Date end = new Date();
            System.out.println(end.getTime() - start.getTime() + " millisekunden");
        } catch (IOException e) {
            System.out.println(" caught a " + e.getClass() + " with message: " + e.getMessage());
        }
    }
    liste = new JList(list.toArray());
    panel1.add(liste);
}
From source file:com.jivesoftware.forum.database.DbSearchManager.java
License:Open Source License
public synchronized void addToIndex(ForumMessage message) {
    if (!searchEnabled) {
        return;
    }
    IndexWriter writer = null;
    try {
        writer = getWriter(false);
        long messageID = message.getID();
        long userID = -1;
        if (!message.isAnonymous()) {
            userID = message.getUser().getID();
        }
        long threadID = message.getForumThread().getID();
        long forumID = message.getForumThread().getForum().getID();
        String subject = message.getUnfilteredSubject();
        String body = message.getUnfilteredBody();
        addMessageToIndex(messageID, userID, threadID, forumID, subject, body,
                message.getCreationDate(), writer);
    } catch (IOException ioe) {
        ioe.printStackTrace();
    } finally {
        if (writer != null) { // guard added: getWriter() may have thrown before assignment
            try {
                writer.close();
            } catch (Exception e) {
                // close failures are deliberately ignored here
            }
        }
    }
}
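DbSearchManager opens a writer and closes it again for every message added. That works, but IndexWriter construction is comparatively expensive and the class is designed to be shared across threads; common Lucene guidance is to hold one long-lived writer and close it once at shutdown. A hedged sketch of that pattern follows; the holder class and its names are illustrative assumptions, not part of DbSearchManager.

// Sketch: one shared, long-lived writer (hypothetical helper, not from DbSearchManager).
public class SharedIndexWriter implements Closeable {
    private final IndexWriter writer; // IndexWriter is thread-safe and meant to be shared

    public SharedIndexWriter(Directory dir, IndexWriterConfig config) throws IOException {
        this.writer = new IndexWriter(dir, config);
    }

    public void add(Document doc) throws IOException {
        writer.addDocument(doc); // no per-call open/close
    }

    @Override
    public void close() throws IOException {
        writer.close(); // invoked once, e.g. at application shutdown
    }
}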
From source file:com.jivesoftware.forum.database.DbSearchManager.java
License:Open Source License
/**
 * Rebuilds the search index from scratch. It deletes the entire index
 * and word tables and then indexes every message up to the end time.
 */
private final void rebuildIndex(Date end) {
    Connection con = null;
    PreparedStatement pstmt = null;
    IndexWriter writer = null;
    String endDate = StringUtils.dateToMillis(end);
    try {
        writer = getWriter(true);
        con = ConnectionManager.getConnection();
        // Get a count of the messages that will be added to the index.
        pstmt = con.prepareStatement(MESSAGES_BEFORE_DATE_COUNT);
        pstmt.setString(1, endDate);
        ResultSet rs = pstmt.executeQuery();
        rs.next();
        // Set the total count so that we can keep track of percentage complete.
        totalCount = rs.getInt(1);
        pstmt.close();
        // Some JDBC drivers don't stream results, but read all records
        // returned by a query into memory. Most notably, the current
        // MySQL JDBC drivers have this problem. As a workaround, we read
        // message data in blocks (default 500).
        // First, get the highest messageID in the database.
        pstmt = con.prepareStatement(HIGHEST_MESSAGE_ID);
        rs = pstmt.executeQuery();
        rs.next();
        long sentinel = (long) (rs.getLong(1) / BLOCK_SIZE);
        pstmt.close();
        // Now, use the messageIDs to select blocks of message data to add
        // to the index at a time.
        pstmt = con.prepareStatement(MESSAGES_BEFORE_DATE);
        for (int i = 0; i <= sentinel; i++) {
            pstmt.setLong(1, BLOCK_SIZE * i);
            pstmt.setLong(2, BLOCK_SIZE * (i + 1));
            pstmt.setString(3, endDate);
            rs = pstmt.executeQuery();
            while (rs.next()) {
                // Increment current count to track percentage complete.
                currentCount++;
                long messageID = rs.getLong(1);
                long userID = rs.getLong(2);
                long threadID = rs.getLong(3);
                long forumID = rs.getLong(4);
                String subject = rs.getString(5);
                String body = rs.getString(6);
                Date creationDate = new Date(Long.parseLong(rs.getString(7).trim()));
                addMessageToIndex(messageID, userID, threadID, forumID, subject, body,
                        creationDate, writer);
            }
            rs.close();
            writer.optimize();
        }
    } catch (Exception e) {
        e.printStackTrace();
    } finally {
        // Null guards added: any of these may be unassigned if an earlier step failed.
        if (pstmt != null) {
            try { pstmt.close(); } catch (Exception e) { e.printStackTrace(); }
        }
        if (con != null) {
            try { con.close(); } catch (Exception e) { e.printStackTrace(); }
        }
        if (writer != null) {
            try { writer.close(); } catch (Exception e) { e.printStackTrace(); }
        }
    }
}
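The writer.optimize() call inside the loop dates this example to Lucene 3.x or earlier; optimize() was deprecated and later removed, with forceMerge taking its place in Lucene 4, and calling it once per block is especially costly. A hedged sketch of the equivalent tail under the 4.x API; forceMerge(1) mirrors the original's intent but is optional and expensive.

// Sketch: Lucene 4.x replacement for the optimize()/close() sequence above.
writer.forceMerge(1); // the 4.x successor to optimize(); merge once, after the loop, if at all
writer.close();       // commits pending changes and releases the index write lock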
From source file:com.jivesoftware.forum.database.DbSearchManager.java
License:Open Source License
/**
 * Updates the index. It first deletes any messages in the index between
 * the start and end times, and then adds all messages to the index that
 * are between the start and end times.
 */
protected final void updateIndex(Date start, Date end) {
    Connection con = null;
    PreparedStatement pstmt = null;
    IndexWriter writer = null;
    LongList messages = new LongList();
    try {
        con = ConnectionManager.getConnection();
        // For a clean update, we need to make sure that we first delete
        // any index entries that were made since we last updated. This
        // might happen if a process was calling indexMessage() between runs
        // of this method. For this reason, the two types of indexing (manual
        // and automatic) should not be intermixed. However, we still perform
        // this deletion to be safe.
        pstmt = con.prepareStatement(MESSAGE_IDS_SINCE_DATE);
        pstmt.setString(1, StringUtils.dateToMillis(start));
        pstmt.setString(2, StringUtils.dateToMillis(end));
        ResultSet rs = pstmt.executeQuery();
        while (rs.next()) {
            messages.add(rs.getLong(1));
        }
        pstmt.close();
        // Now, delete those messages.
        deleteMessagesFromIndex(messages.toArray());
        // Get a count of the messages that will be added to the index.
        pstmt = con.prepareStatement(MESSAGES_SINCE_DATE_COUNT);
        pstmt.setString(1, StringUtils.dateToMillis(start));
        pstmt.setString(2, StringUtils.dateToMillis(end));
        rs = pstmt.executeQuery();
        rs.next();
        // Set the total count so that we can keep track of percentage complete.
        totalCount = rs.getInt(1);
        pstmt.close();
        // Add the messages to the index.
        writer = getWriter(false);
        pstmt = con.prepareStatement(MESSAGES_SINCE_DATE);
        pstmt.setString(1, StringUtils.dateToMillis(start));
        pstmt.setString(2, StringUtils.dateToMillis(end));
        rs = pstmt.executeQuery();
        while (rs.next()) {
            // Increment current count to track percentage complete.
            currentCount++;
            long messageID = rs.getLong(1);
            long userID = rs.getLong(2);
            long threadID = rs.getLong(3);
            long forumID = rs.getLong(4);
            String subject = rs.getString(5);
            String body = rs.getString(6);
            Date creationDate = new Date(Long.parseLong(rs.getString(7).trim()));
            addMessageToIndex(messageID, userID, threadID, forumID, subject, body,
                    creationDate, writer);
        }
    } catch (Exception e) {
        e.printStackTrace();
    } finally {
        // Null guards added: any of these may be unassigned if an earlier step failed.
        if (pstmt != null) {
            try { pstmt.close(); } catch (Exception e) { e.printStackTrace(); }
        }
        if (con != null) {
            try { con.close(); } catch (Exception e) { e.printStackTrace(); }
        }
        if (writer != null) {
            try { writer.close(); } catch (Exception e) { e.printStackTrace(); }
        }
    }
}
From source file:com.justinleegrant.myluceneplayground.SimpleFacetsExample.java
License:Apache License
/** Build the example index. */
private void index() throws IOException {
    IndexWriter indexWriter = new IndexWriter(indexDir,
            new IndexWriterConfig(Version.LATEST, new WhitespaceAnalyzer()).setOpenMode(OpenMode.CREATE));
    // Writes facet ords to a separate directory from the main index
    DirectoryTaxonomyWriter taxoWriter = new DirectoryTaxonomyWriter(taxoDir);

    Document doc = new Document();
    doc.add(new FacetField("Author", "Bob"));
    doc.add(new FacetField("Publish Date", "2010", "10", "15"));
    indexWriter.addDocument(config.build(taxoWriter, doc));

    doc = new Document();
    doc.add(new FacetField("Author", "Lisa"));
    doc.add(new FacetField("Publish Date", "2010", "10", "20"));
    indexWriter.addDocument(config.build(taxoWriter, doc));

    doc = new Document();
    doc.add(new FacetField("Author", "Lisa"));
    doc.add(new FacetField("Publish Date", "2012", "1", "1"));
    indexWriter.addDocument(config.build(taxoWriter, doc));

    doc = new Document();
    doc.add(new FacetField("Author", "Susan"));
    doc.add(new FacetField("Publish Date", "2012", "1", "7"));
    indexWriter.addDocument(config.build(taxoWriter, doc));

    doc = new Document();
    doc.add(new FacetField("Author", "Frank"));
    doc.add(new FacetField("Publish Date", "1999", "5", "5"));
    indexWriter.addDocument(config.build(taxoWriter, doc));

    indexWriter.close();
    taxoWriter.close();
}
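Both IndexWriter and DirectoryTaxonomyWriter are Closeable, so the two trailing close() calls above can be made exception-safe with a single try-with-resources statement. A hedged restructuring of the same method; resources close in reverse declaration order, so the taxonomy writer closes first.

// Sketch: exception-safe variant of index() above.
try (IndexWriter indexWriter = new IndexWriter(indexDir,
            new IndexWriterConfig(Version.LATEST, new WhitespaceAnalyzer()).setOpenMode(OpenMode.CREATE));
     DirectoryTaxonomyWriter taxoWriter = new DirectoryTaxonomyWriter(taxoDir)) {
    Document doc = new Document();
    doc.add(new FacetField("Author", "Bob"));
    doc.add(new FacetField("Publish Date", "2010", "10", "15"));
    indexWriter.addDocument(config.build(taxoWriter, doc));
    // ... remaining four documents exactly as in the original ...
} // both writers are closed even if addDocument throws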
From source file:com.khepry.frackhem.entities.Blendeds.java
License:Apache License
public void indexViaLucene(String textFilePath, String textColSeparator, String casEdfIdFieldName,
        Map<String, Toxicity> toxicities) throws IOException {
    String message;
    message = "Start Indexing Blendeds via Lucene...";
    if (outputToSystemOut) { System.out.println(message); }
    if (outputToMsgQueue) { progressMessageQueue.send(new MessageInput(message)); }
    File textFile = new File(textFilePath);
    if (textFile.exists()) {
        File indexFolder = new File(indexFolderPath);
        if (!indexFolder.exists()) {
            indexFolder.mkdir();
        } else {
            deleteFolder(indexFolder);
            if (!indexFolder.exists()) {
                indexFolder.mkdir();
            }
        }
        File taxonomyFolder = new File(taxonomyFolderPath);
        if (!taxonomyFolder.exists()) {
            taxonomyFolder.mkdir();
        } else {
            deleteFolder(taxonomyFolder);
            if (!taxonomyFolder.exists()) {
                taxonomyFolder.mkdir();
            }
        }
        if (indexFolder.exists() && taxonomyFolder.exists()) {
            List<String> colHeaders = new ArrayList<>();
            Map<String, Integer> colIndexes = new LinkedHashMap<>();
            Map<String, String> mapIndexFields = new LinkedHashMap<>();
            Map<String, String> mapStatsFields = new LinkedHashMap<>();
            String[] pieces;
            String[] tuples;
            pieces = indexFields.split(",");
            for (String indexField : pieces) {
                mapIndexFields.put(indexField, indexField);
            }
            pieces = statsFields.split(",");
            for (String statField : pieces) {
                tuples = statField.split(":");
                mapStatsFields.put(tuples[0], tuples.length > 1 ? tuples[1] : tuples[0]);
            }
            SimpleFSDirectory indexDirectory = new SimpleFSDirectory(indexFolder);
            Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_44);
            IndexWriterConfig indexWriterConfig = new IndexWriterConfig(Version.LUCENE_44, analyzer);
            IndexWriter indexWriter = new IndexWriter(indexDirectory, indexWriterConfig);
            SimpleFSDirectory taxonomyDirectory = new SimpleFSDirectory(taxonomyFolder);
            TaxonomyWriter taxonomyWriter = new DirectoryTaxonomyWriter(taxonomyDirectory, OpenMode.CREATE);
            FacetFields facetFields = new FacetFields(taxonomyWriter);
            List<CategoryPath> taxonomyCategories = new ArrayList<>();
            String line;
            Integer rcdCount = 0;
            StringBuilder sb = new StringBuilder();
            BufferedReader br = new BufferedReader(new FileReader(textFile));
            while ((line = br.readLine()) != null) {
                rcdCount++;
                pieces = line.split(textColSeparator);
                if (rcdCount == 1) {
                    int i = 0;
                    for (String colHeader : pieces) {
                        colHeaders.add(colHeader.trim());
                        colIndexes.put(colHeader, i);
                        i++; // bug fix: the original never incremented i, mapping every column to index 0
                    }
                } else {
                    if (pieces.length == colHeaders.size()) {
                        sb.setLength(0);
                        Document document = new Document();
                        for (int i = 0; i < pieces.length; i++) {
                            Field field = new TextField(colHeaders.get(i), pieces[i].trim(), Store.YES);
                            document.add(field);
                            if (mapIndexFields.containsKey(colHeaders.get(i))) {
                                if (!pieces[i].trim().equals("")) {
                                    sb.append(pieces[i].trim());
                                    sb.append(" ");
                                }
                            }
                        }
                        // append toxicity information to the document
                        String toxCasEdfId = document.get(casEdfIdFieldName).trim();
                        Toxicity toxicity = new Toxicity();
                        if (toxicities.containsKey(toxCasEdfId)) {
                            toxicity = toxicities.get(toxCasEdfId);
                            document.add(new TextField("toxChemicalName",
                                    toxicity.getToxChemicalName().trim(), Store.YES));
                            sb.append(toxicity.getToxChemicalName().trim());
                            sb.append(" ");
                            document.add(new TextField("toxRecognized",
                                    toxicity.getToxRecognized().trim(), Store.YES));
                            sb.append(toxicity.getToxRecognized().trim());
                            sb.append(" ");
                            document.add(new TextField("toxSuspected",
                                    toxicity.getToxSuspected().trim(), Store.YES));
                            sb.append(toxicity.getToxSuspected().trim());
                            sb.append(" ");
                        } else {
                            document.add(new TextField("toxChemicalName", "", Store.YES));
                            document.add(new TextField("toxRecognized", "", Store.YES));
                            document.add(new TextField("toxSuspected", "", Store.YES));
                        }
                        Field field = new TextField("text", sb.toString().trim(), Store.NO);
                        document.add(field);
                        String toxChemical = toxicity.getToxChemicalName().trim();
                        // categorize recognized toxicities
                        String toxRecognized = toxicity.getToxRecognized().trim();
                        if (!toxRecognized.equals("")) {
                            taxonomyCategories.add(new CategoryPath("toxRecognized", "CasEdfId", toxCasEdfId));
                            taxonomyCategories.add(new CategoryPath("toxRecognized", "Chemical",
                                    toxChemical.replace("/", "|")));
                            for (String value : toxRecognized.replace(" ", ",").split(",")) {
                                if (!value.trim().equals("")) {
                                    taxonomyCategories.add(new CategoryPath("toxRecognized", "Toxicity", value));
                                }
                            }
                        }
                        // categorize suspected toxicities
                        String toxSuspected = toxicity.getToxSuspected().trim();
                        if (!toxSuspected.equals("")) {
                            taxonomyCategories.add(new CategoryPath("toxSuspected", "CasEdfId", toxCasEdfId));
                            taxonomyCategories.add(new CategoryPath("toxSuspected", "Chemical",
                                    toxChemical.replace("/", "|")));
                            for (String value : toxSuspected.replace(" ", ",").split(",")) {
                                if (!value.trim().equals("")) {
                                    taxonomyCategories.add(new CategoryPath("toxSuspected", "Toxicity", value));
                                }
                            }
                        }
                        // build up "stats" taxonomy categories
                        for (String statsKey : mapStatsFields.keySet()) {
                            if (mapIndexFields.containsKey(statsKey)) {
                                String fieldValue = mapIndexFields.get(statsKey);
                                if (!statsKey.trim().equals("") && !fieldValue.trim().equals("")) {
                                    taxonomyCategories.add(new CategoryPath("Blendeds", statsKey, fieldValue));
                                }
                            }
                        }
                        if (taxonomyCategories.size() > 0) {
                            facetFields.addFields(document, taxonomyCategories);
                            // System.out.println("Taxonomies added: " + taxonomyCategories.size());
                        }
                        indexWriter.addDocument(document);
                        if (progressInterval > 0 && rcdCount % progressInterval == 0) {
                            message = "Records indexed: " + rcdCount;
                            if (outputToSystemOut) { System.out.println(message); }
                            if (outputToMsgQueue) { progressMessageQueue.send(new MessageInput(message)); }
                        }
                        taxonomyCategories.clear();
                    }
                }
            }
            br.close();
            message = "Records indexed: " + rcdCount;
            if (outputToSystemOut) { System.out.println(message); }
            if (outputToMsgQueue) { progressMessageQueue.send(new MessageInput(message)); }
            sb.setLength(0);
            sb.trimToSize();
            indexWriter.commit();
            indexWriter.forceMerge(1);
            indexWriter.close();
            taxonomyWriter.commit();
            taxonomyWriter.close();
            analyzer.close();
            indexDirectory.close();
            taxonomyDirectory.close();
        } else {
            message = "Lucene Index Folder: " + indexFolder + " or Lucene Taxonomy folder: " + taxonomyFolder
                    + " does not exist!";
            if (outputToSystemErr) { System.err.println(message); }
            if (outputToMsgQueue) { progressMessageQueue.send(new MessageInput(message)); }
        }
        message = "Ended Indexing Blendeds via Lucene!";
        if (outputToSystemOut) { System.out.println(message); }
        if (outputToMsgQueue) { progressMessageQueue.send(new MessageInput(message)); }
    }
}
From source file:com.khepry.frackhem.entities.Chemicals.java
License:Apache License
public void indexViaLucene(String textFilePath, String textColSeparator, String casEdfIdFieldName,
        Map<String, Toxicity> toxicities) throws IOException {
    String message;
    message = "Start Indexing Chemicals via Lucene...";
    if (outputToSystemOut) { System.out.println(message); }
    if (outputToMsgQueue) { progressMessageQueue.send(new MessageInput(message)); }
    File textFile = new File(textFilePath);
    if (textFile.exists()) {
        File indexFolder = new File(indexFolderPath);
        if (!indexFolder.exists()) {
            indexFolder.mkdir();
        } else {
            deleteFolder(indexFolder);
            if (!indexFolder.exists()) {
                indexFolder.mkdir();
            }
        }
        File taxonomyFolder = new File(taxonomyFolderPath);
        if (!taxonomyFolder.exists()) {
            taxonomyFolder.mkdir();
        } else {
            deleteFolder(taxonomyFolder);
            if (!taxonomyFolder.exists()) {
                taxonomyFolder.mkdir();
            }
        }
        if (indexFolder.exists() && taxonomyFolder.exists()) {
            List<String> colHeaders = new ArrayList<>();
            Map<String, Integer> colIndexes = new LinkedHashMap<>();
            Map<String, String> mapIndexFields = new LinkedHashMap<>();
            Map<String, String> mapStatsFields = new LinkedHashMap<>();
            String[] pieces;
            String[] tuples;
            pieces = indexFields.split(",");
            for (String indexField : pieces) {
                mapIndexFields.put(indexField, indexField);
            }
            pieces = statsFields.split(",");
            for (String statField : pieces) {
                tuples = statField.split(":");
                mapStatsFields.put(tuples[0], tuples.length > 1 ? tuples[1] : tuples[0]);
            }
            SimpleFSDirectory indexDirectory = new SimpleFSDirectory(indexFolder);
            Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_44);
            IndexWriterConfig indexWriterConfig = new IndexWriterConfig(Version.LUCENE_44, analyzer);
            IndexWriter indexWriter = new IndexWriter(indexDirectory, indexWriterConfig);
            SimpleFSDirectory taxonomyDirectory = new SimpleFSDirectory(taxonomyFolder);
            TaxonomyWriter taxonomyWriter = new DirectoryTaxonomyWriter(taxonomyDirectory, OpenMode.CREATE);
            FacetFields facetFields = new FacetFields(taxonomyWriter);
            List<CategoryPath> taxonomyCategories = new ArrayList<>();
            String line;
            Integer rcdCount = 0;
            StringBuilder sb = new StringBuilder();
            BufferedReader br = new BufferedReader(new FileReader(textFile));
            while ((line = br.readLine()) != null) {
                rcdCount++;
                pieces = line.split(textColSeparator);
                if (rcdCount == 1) {
                    int i = 0;
                    for (String colHeader : pieces) {
                        colHeaders.add(colHeader.trim());
                        colIndexes.put(colHeader, i);
                        i++; // bug fix: the original never incremented i, mapping every column to index 0
                    }
                } else {
                    if (pieces.length == colHeaders.size()) {
                        sb.setLength(0);
                        Document document = new Document();
                        for (int i = 0; i < pieces.length; i++) {
                            Field field = new TextField(colHeaders.get(i), pieces[i].trim(), Store.YES);
                            document.add(field);
                            if (mapIndexFields.containsKey(colHeaders.get(i))) {
                                if (!pieces[i].trim().equals("")) {
                                    sb.append(pieces[i].trim());
                                    sb.append(" ");
                                }
                            }
                        }
                        // append toxicity information to the document
                        String toxCasEdfId = document.get(casEdfIdFieldName).trim();
                        Toxicity toxicity = new Toxicity();
                        if (toxicities.containsKey(toxCasEdfId)) {
                            toxicity = toxicities.get(toxCasEdfId);
                            document.add(new TextField("toxChemicalName",
                                    toxicity.getToxChemicalName().trim(), Store.YES));
                            sb.append(toxicity.getToxChemicalName().trim());
                            sb.append(" ");
                            document.add(new TextField("toxRecognized",
                                    toxicity.getToxRecognized().trim(), Store.YES));
                            sb.append(toxicity.getToxRecognized().trim());
                            sb.append(" ");
                            document.add(new TextField("toxSuspected",
                                    toxicity.getToxSuspected().trim(), Store.YES));
                            sb.append(toxicity.getToxSuspected().trim());
                            sb.append(" ");
                        } else {
                            document.add(new TextField("toxChemicalName", "", Store.YES));
                            document.add(new TextField("toxRecognized", "", Store.YES));
                            document.add(new TextField("toxSuspected", "", Store.YES));
                        }
                        Field field = new TextField("text", sb.toString().trim(), Store.NO);
                        document.add(field);
                        String toxChemical = toxicity.getToxChemicalName().trim();
                        // categorize recognized toxicities
                        String toxRecognized = toxicity.getToxRecognized().trim();
                        if (!toxRecognized.equals("")) {
                            taxonomyCategories.add(new CategoryPath("toxRecognized", "CasEdfId", toxCasEdfId));
                            taxonomyCategories.add(new CategoryPath("toxRecognized", "Chemical",
                                    toxChemical.replace("/", "|")));
                            for (String value : toxRecognized.replace(" ", ",").split(",")) {
                                if (!value.trim().equals("")) {
                                    taxonomyCategories.add(new CategoryPath("toxRecognized", "Toxicity", value));
                                }
                            }
                        }
                        // categorize suspected toxicities
                        String toxSuspected = toxicity.getToxSuspected().trim();
                        if (!toxSuspected.equals("")) {
                            taxonomyCategories.add(new CategoryPath("toxSuspected", "CasEdfId", toxCasEdfId));
                            taxonomyCategories.add(new CategoryPath("toxSuspected", "Chemical",
                                    toxChemical.replace("/", "|")));
                            for (String value : toxSuspected.replace(" ", ",").split(",")) {
                                if (!value.trim().equals("")) {
                                    taxonomyCategories.add(new CategoryPath("toxSuspected", "Toxicity", value));
                                }
                            }
                        }
                        // build up "stats" taxonomy categories
                        for (String statsKey : mapStatsFields.keySet()) {
                            if (mapIndexFields.containsKey(statsKey)) {
                                String fieldValue = mapIndexFields.get(statsKey);
                                if (!statsKey.trim().equals("") && !fieldValue.trim().equals("")) {
                                    taxonomyCategories.add(new CategoryPath("Chemicals", statsKey, fieldValue));
                                }
                            }
                        }
                        if (taxonomyCategories.size() > 0) {
                            facetFields.addFields(document, taxonomyCategories);
                            // System.out.println("Taxonomies added: " + taxonomyCategories.size());
                        }
                        indexWriter.addDocument(document);
                        if (progressInterval > 0 && rcdCount % progressInterval == 0) {
                            message = "Records indexed: " + rcdCount;
                            if (outputToSystemOut) { System.out.println(message); }
                            if (outputToMsgQueue) { progressMessageQueue.send(new MessageInput(message)); }
                        }
                        taxonomyCategories.clear();
                    }
                }
            }
            br.close();
            message = "Records indexed: " + rcdCount;
            if (outputToSystemOut) { System.out.println(message); }
            if (outputToMsgQueue) { progressMessageQueue.send(new MessageInput(message)); }
            sb.setLength(0);
            sb.trimToSize();
            indexWriter.commit();
            indexWriter.forceMerge(1);
            indexWriter.close();
            taxonomyWriter.commit();
            taxonomyWriter.close();
            analyzer.close();
            indexDirectory.close();
            taxonomyDirectory.close();
        } else {
            message = "Lucene Index Folder: " + indexFolder + " or Lucene Taxonomy folder: " + taxonomyFolder
                    + " does not exist!";
            if (outputToSystemErr) { System.err.println(message); }
            if (outputToMsgQueue) { progressMessageQueue.send(new MessageInput(message)); }
        }
        message = "Ended Indexing Chemicals via Lucene!";
        if (outputToSystemOut) { System.out.println(message); }
        if (outputToMsgQueue) { progressMessageQueue.send(new MessageInput(message)); }
    }
}
From source file:com.khepry.frackhem.entities.Reports.java
License:Apache License
public void indexViaLucene(String textPath, String textColSeparator, Map<String, Toxicity> toxicities,
        String... parseFields) throws IOException {
    String message;
    message = "Start Indexing Reports via Lucene...";
    if (outputToSystemOut) { System.out.println(message); }
    if (outputToMsgQueue) { progressMessageQueue.send(new MessageInput(message)); }
    File textFile = new File(textPath);
    if (textFile.exists()) {
        File indexFolder = new File(indexFolderPath);
        if (!indexFolder.exists()) {
            indexFolder.mkdir();
        }
        File taxonomyFolder = new File(taxonomyFolderPath);
        if (!taxonomyFolder.exists()) {
            taxonomyFolder.mkdir();
        }
        if (indexFolder.exists() && taxonomyFolder.exists()) {
            deleteFolder(indexFolder);
            if (!indexFolder.exists()) {
                indexFolder.mkdir();
            }
            deleteFolder(taxonomyFolder);
            if (!taxonomyFolder.exists()) {
                taxonomyFolder.mkdir();
            }
            Map<String, String> mapBreakFields = new LinkedHashMap<>();
            Map<String, String> mapIndexFields = new LinkedHashMap<>();
            Map<String, String> mapLevelFields = new LinkedHashMap<>();
            Map<String, String> mapStatsFields = new LinkedHashMap<>();
            Map<String, Integer> mapColIndexes = new LinkedHashMap<>();
            String[] pieces;
            String[] tuples;
            pieces = indexFields.split(",");
            for (String indexField : pieces) {
                mapIndexFields.put(indexField, "");
            }
            pieces = levelFields.split(",");
            for (String levelField : pieces) {
                mapBreakFields.put(levelField, "");
                mapLevelFields.put(levelField, "");
            }
            pieces = statsFields.split(",");
            for (String statField : pieces) {
                tuples = statField.split(":");
                mapStatsFields.put(tuples[0], tuples.length > 1 ? tuples[1] : tuples[0]);
            }
            Map<String, Map<String, String>> mapToxValues = new LinkedHashMap<>();
            for (String parseField : parseFields) {
                mapToxValues.put(parseField, new TreeMap<String, String>());
            }
            SimpleFSDirectory indexDirectory = new SimpleFSDirectory(indexFolder);
            SimpleFSDirectory taxonomyDirectory = new SimpleFSDirectory(taxonomyFolder);
            Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_44);
            IndexWriterConfig indexWriterConfig = new IndexWriterConfig(Version.LUCENE_44, analyzer);
            IndexWriter indexWriter = new IndexWriter(indexDirectory, indexWriterConfig);
            TaxonomyWriter taxonomyWriter = new DirectoryTaxonomyWriter(taxonomyDirectory, OpenMode.CREATE);
            FacetFields facetFields = new FacetFields(taxonomyWriter);
            List<CategoryPath> taxonomyCategories = new ArrayList<>();
            String line;
            StringBuilder sbIndex = new StringBuilder();
            StringBuilder sbLevel = new StringBuilder();
            Integer outCount = 0;
            Integer rcdCount = 0;
            Boolean firstDataRecordHandled = false;
            BufferedReader br = new BufferedReader(new FileReader(textFile));
            while ((line = br.readLine()) != null) {
                rcdCount++;
                pieces = line.split(textColSeparator);
                if (rcdCount == 1) {
                    int i = 0;
                    for (String colHeader : pieces) {
                        mapColIndexes.put(colHeader.trim(), i);
                        i++;
                    }
                } else {
                    for (String key : mapLevelFields.keySet()) {
                        if (mapColIndexes.containsKey(key)) {
                            String value = pieces[mapColIndexes.get(key)].trim();
                            // build up level-break values
                            if (mapLevelFields.containsKey(key)) {
                                mapLevelFields.put(key, value);
                            }
                        }
                    }
                    if (!firstDataRecordHandled) {
                        mapBreakFields.putAll(mapLevelFields);
                        firstDataRecordHandled = true;
                    }
                    // if there is a "level break"
                    if (!mapLevelFields.equals(mapBreakFields)) {
                        Document tgtDocument = new Document();
                        for (Map.Entry<String, String> entry : mapBreakFields.entrySet()) {
                            Field field = new TextField(entry.getKey(), entry.getValue(), Store.YES);
                            tgtDocument.add(field);
                        }
                        for (Map.Entry<String, Map<String, String>> toxEntry : mapToxValues.entrySet()) {
                            String fieldName = toxEntry.getKey();
                            String fieldValue = GenericUtilities.joinString(toxEntry.getValue().values(), " ");
                            // System.out.println(fieldName + ": " + fieldValue);
                            sbIndex.append(fieldValue);
                            sbIndex.append(" ");
                            tgtDocument.add(new TextField(fieldName, fieldValue, Store.YES));
                            // build up "Toxicity" taxonomy categories
                            for (String value : fieldValue.replace(" ", ",").split(",")) {
                                if (!value.trim().equals("")) {
                                    taxonomyCategories.add(new CategoryPath(fieldName, "Toxicity", value));
                                }
                            }
                            // build up "stats" taxonomy categories
                            for (String statsKey : mapStatsFields.keySet()) {
                                if (mapLevelFields.containsKey(statsKey)) {
                                    String levelValue = mapLevelFields.get(statsKey);
                                    if (!statsKey.trim().equals("") && !levelValue.trim().equals("")) {
                                        taxonomyCategories.add(new CategoryPath("Reports", statsKey, levelValue));
                                    }
                                }
                            }
                        }
                        tgtDocument.add(new TextField("text", sbIndex.toString().trim(), Store.NO));
                        if (taxonomyCategories.size() > 0) {
                            facetFields.addFields(tgtDocument, taxonomyCategories);
                            // System.out.println("Taxonomies added: " + taxonomyCategories.size());
                        }
                        indexWriter.addDocument(tgtDocument);
                        outCount++;
                        sbIndex.setLength(0);
                        for (String key : mapToxValues.keySet()) {
                            mapToxValues.get(key).clear();
                        }
                        taxonomyCategories.clear();
                        mapBreakFields.putAll(mapLevelFields);
                    }
                    // build up text index values
                    for (String key : mapLevelFields.keySet()) {
                        if (mapColIndexes.containsKey(key)) {
                            String value = pieces[mapColIndexes.get(key)].trim();
                            if (!value.equals("")) {
                                // build up 'text' field index value
                                if (mapIndexFields.containsKey(key)) {
                                    sbIndex.append(value);
                                    sbIndex.append(" ");
                                }
                            }
                        }
                    }
                    // build up toxicity values for later level-break use
                    if (mapColIndexes.containsKey(casEdfIdFieldName)) {
                        Toxicity toxicity = toxicities.get(pieces[mapColIndexes.get(casEdfIdFieldName)].trim());
                        if (toxicity != null) {
                            // build up recognized toxicity values
                            String[] toxRValues = toxicity.getToxRecognized().split(",");
                            for (String toxValue : toxRValues) {
                                if (!toxValue.equals("")) {
                                    if (!mapToxValues.get("toxRecognized").containsKey(toxValue)) {
                                        mapToxValues.get("toxRecognized").put(toxValue, toxValue);
                                    }
                                }
                            }
                            // build up suspected toxicity values
                            String[] toxSValues = toxicity.getToxSuspected().split(",");
                            for (String toxValue : toxSValues) {
                                if (!toxValue.equals("")) {
                                    if (!mapToxValues.get("toxSuspected").containsKey(toxValue)) {
                                        mapToxValues.get("toxSuspected").put(toxValue, toxValue);
                                    }
                                }
                            }
                        }
                    }
                    if (progressInterval > 0 && rcdCount % progressInterval == 0) {
                        message = "Records indexed: " + rcdCount;
                        if (outputToSystemOut) { System.out.println(message); }
                        if (outputToMsgQueue) { progressMessageQueue.send(new MessageInput(message)); }
                    }
                }
            }
            br.close();
            // handle end-of-file processing
            Document tgtDocument = new Document();
            for (Map.Entry<String, String> entry : mapBreakFields.entrySet()) {
                Field field = new TextField(entry.getKey(), entry.getValue(), Store.YES);
                tgtDocument.add(field);
            }
            for (Map.Entry<String, Map<String, String>> toxEntry : mapToxValues.entrySet()) {
                String fieldName = toxEntry.getKey();
                String fieldValue = GenericUtilities.joinString(toxEntry.getValue().values(), " ");
                // System.out.println(fieldName + ": " + fieldValue);
                sbIndex.append(fieldValue);
                sbIndex.append(" ");
                tgtDocument.add(new TextField(fieldName, fieldValue, Store.YES));
                // build up "Toxicity" taxonomy categories
                for (String value : fieldValue.replace(" ", ",").split(",")) {
                    if (!value.trim().equals("")) {
                        taxonomyCategories.add(new CategoryPath(fieldName, "Toxicity", value));
                    }
                }
                // build up "stats" taxonomy categories
                for (String statsKey : mapStatsFields.keySet()) {
                    if (mapLevelFields.containsKey(statsKey)) {
                        String levelValue = mapLevelFields.get(statsKey);
                        if (!statsKey.trim().equals("") && !levelValue.trim().equals("")) {
                            taxonomyCategories.add(new CategoryPath("Reports", statsKey, levelValue));
                        }
                    }
                }
            }
            tgtDocument.add(new TextField("text", sbIndex.toString().trim(), Store.NO));
            if (taxonomyCategories.size() > 0) {
                facetFields.addFields(tgtDocument, taxonomyCategories);
                // System.out.println("Taxonomies added: " + taxonomyCategories.size());
            }
            indexWriter.addDocument(tgtDocument);
            outCount++;
            message = "Records processed: " + rcdCount;
            if (outputToSystemOut) { System.out.println(message); }
            if (outputToMsgQueue) { progressMessageQueue.send(new MessageInput(message)); }
            message = "Records indexed: " + outCount;
            if (outputToSystemOut) { System.out.println(message); }
            if (outputToMsgQueue) { progressMessageQueue.send(new MessageInput(message)); }
            sbIndex.setLength(0);
            sbIndex.trimToSize();
            sbLevel.setLength(0);
            sbLevel.trimToSize();
            mapToxValues.clear();
            indexWriter.commit();
            indexWriter.forceMerge(1);
            indexWriter.close();
            analyzer.close();
            indexDirectory.close();
            taxonomyWriter.commit();
            taxonomyWriter.close();
            taxonomyDirectory.close();
        } else {
            message = "Lucene Index Folder: " + indexFolder + " or Lucene Taxonomy folder: " + taxonomyFolder
                    + " does not exist!";
            if (outputToSystemErr) { System.err.println(message); }
            if (outputToMsgQueue) { progressMessageQueue.send(new MessageInput(message)); }
        }
        message = "Ended Indexing Reports via Lucene!";
        if (outputToSystemOut) { System.out.println(message); }
        if (outputToMsgQueue) { progressMessageQueue.send(new MessageInput(message)); }
    }
}