/*
 * Leech - crawling capabilities for Apache Tika
 *
 * Copyright (C) 2012 DFKI GmbH, Author: Christian Reuschling
 *
 * This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation,
 * either version 3 of the License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
 * PARTICULAR PURPOSE. See the GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License along with this program. If not, see <http://www.gnu.org/licenses/>.
 *
 * Contact us by mail: christian.reuschling@dfki.de
 */

package de.dfki.km.leech.lucene;



import java.io.File;
import java.io.IOException;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.rmi.server.UID;
import java.util.Collection;
import java.util.Collections;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.UUID;
import java.util.concurrent.BlockingQueue;
import java.util.concurrent.CyclicBarrier;
import java.util.concurrent.LinkedBlockingQueue;
import java.util.logging.Level;
import java.util.logging.Logger;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
import org.apache.lucene.index.IndexableField;
import org.apache.lucene.index.Term;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.store.LockObtainFailedException;
import org.apache.lucene.store.SimpleFSDirectory;
import org.apache.lucene.util.Version;
import org.apache.tika.metadata.Metadata;

import de.dfki.inquisition.collections.MultiValueHashMap;
import de.dfki.inquisition.file.FileUtils;
import de.dfki.inquisition.lucene.FieldConfig;
import de.dfki.km.leech.Leech;
import de.dfki.km.leech.metadata.LeechMetadata;
import de.dfki.km.leech.parser.incremental.IncrementalCrawlingHistory;
import de.dfki.km.leech.sax.DataSinkContentHandler;



/**
 * This is a content handler that allows you to store crawled data into a Lucene index. You can configure the field types and the analyzers that should be used.
 * Further, block indexing with {@link IndexWriter#addDocuments(java.util.Collection, Analyzer)} is supported; you can enable it with
 * {@link ToLuceneContentHandler#setBlockIndexing(boolean)}. If it is enabled, {@link ToLuceneContentHandler} checks whether the metadata contains a
 * {@link LeechMetadata#childId} or a {@link LeechMetadata#parentId} key. Documents with a {@link LeechMetadata#childId} entry will appear as parent documents, docs with
 * a {@link LeechMetadata#parentId} entry as children. {@link ToLuceneContentHandler} collects the child documents as they appear at a processXXX method, and writes them
 * as a block at the time a succeeding parent document appears. In the case a non-parent doc appears, all collected docs will be indexed normally, not as a block.
 *
 * @author Christian Reuschling, Dipl.Ing.(BA)
 */
public class ToLuceneContentHandler extends DataSinkContentHandler
{

    protected class DocConsumer implements Runnable
    {

        @Override
        public void run()
        {
            try
            {
                while (true)
                {
                    List<Document> llDocs = m_addDocsQueue.take();

                    if (llDocs instanceof InterruptThreadList)
                    {
                        break;
                    }

                    try
                    {
                        if (llDocs.size() == 1)
                        {
                            getCurrentWriter().addDocument(llDocs.get(0));
                        }
                        else if (llDocs.size() > 1)
                        {
                            getCurrentWriter().addDocuments(llDocs);
                        }
                    }
                    catch (Exception e)
                    {
                        Logger.getLogger(ToLuceneContentHandler.DocConsumer.class.getName()).log(Level.WARNING,
                                "Error during writing a document to the index (lucene exception while addDocument) - will ignore it. This is a hint to a lucene bug. "
                                        + llDocs, e);
                    }
                }
            }
            catch (InterruptedException e)
            {
                // NOP
            }
            catch (Exception e)
            {
                Logger.getLogger(ToLuceneContentHandler.DocConsumer.class.getName()).log(Level.SEVERE, "Error", e);
            }
            finally
            {
                try
                {
                    m_cyclicBarrier4DocConsumerThreads.await();
                }
                catch (Exception e2)
                {
                    Logger.getLogger(ToLuceneContentHandler.DocConsumer.class.getName()).log(Level.SEVERE, "Error", e2);
                }
            }
        }
    }



    protected class InterruptThreadList extends LinkedList<Document>
    {
        private static final long serialVersionUID = 196832081918659203L;
    }



    protected final BlockingQueue<List<Document>> m_addDocsQueue = new LinkedBlockingQueue<List<Document>>(23);

    protected boolean m_bBlockIndexing = true;

    protected CyclicBarrier m_cyclicBarrier4DocConsumerThreads;

    protected FieldConfig m_fieldConfig = new FieldConfig();

    protected HashSet<String> m_hsAttNamesNot2Store = new HashSet<String>();

    protected Map<String, String> m_hsFieldName2FieldValueConstraint;

    protected MultiValueHashMap<String, String> m_hsSource2TargetFieldnames = new MultiValueHashMap<String, String>();

    protected MultiValueHashMap<String, String> m_hsStaticAttValuePairs = new MultiValueHashMap<String, String>();

    protected MultiValueHashMap<String, String> m_hsTarget2SourcesFieldnames = new MultiValueHashMap<String, String>();

    protected HashSet<String> m_hsTmpLuceneWriterPaths2Merge = new HashSet<String>();

    protected IndexWriter m_initialLuceneWriter;

    protected int m_iSplitIndexDocumentCount = -1;

    protected LinkedList<Thread> m_llConsumerThreads = new LinkedList<Thread>();

    protected LinkedList<IndexWriter> m_llIndexWriter2Close = new LinkedList<IndexWriter>();

    protected LinkedList<Document> m_llLastChildDocuments = new LinkedList<Document>();

    protected IndexWriter m_luceneWriter;



    public ToLuceneContentHandler(FieldConfig fieldConfig, IndexWriter luceneWriter) throws Exception
    {
        super();
        m_fieldConfig = fieldConfig;
        m_luceneWriter = luceneWriter;
        m_initialLuceneWriter = m_luceneWriter;

        init();
    }



    public ToLuceneContentHandler(int writeLimit, FieldConfig fieldConfig, IndexWriter luceneWriter) throws Exception
    {
        super(writeLimit);
        m_fieldConfig = fieldConfig;
        m_luceneWriter = luceneWriter;
        m_initialLuceneWriter = m_luceneWriter;

        init();
    }



    public ToLuceneContentHandler(Metadata metadata, FieldConfig fieldConfig, IndexWriter luceneWriter) throws Exception
    {
        super(metadata);
        m_fieldConfig = fieldConfig;
        m_luceneWriter = luceneWriter;
        m_initialLuceneWriter = m_luceneWriter;

        init();
    }



    public ToLuceneContentHandler(Metadata metadata, int writeLimit, FieldConfig fieldConfig, IndexWriter luceneWriter) throws Exception
    {
        super(metadata, writeLimit);
        m_fieldConfig = fieldConfig;
        m_luceneWriter = luceneWriter;
        m_initialLuceneWriter = m_luceneWriter;

        init();
    }
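    // Usage sketch (illustrative only): how such a handler is typically wired to an IndexWriter and a crawl. The index path, the analyzer and the
    // exact Leech.parse(..) signature are assumptions made for this example and may differ in your setup.
    //
    //   IndexWriter luceneWriter = new IndexWriter(FSDirectory.open(Paths.get("./luceneIndex")),
    //           new IndexWriterConfig(new StandardAnalyzer()).setOpenMode(OpenMode.CREATE_OR_APPEND));
    //   ToLuceneContentHandler handler = new ToLuceneContentHandler(new FieldConfig(), luceneWriter);
    //
    //   new Leech().parse(new File("/data/to/crawl"), handler, new ParseContext());
    //
    //   handler.crawlFinished();   // only strictly needed when split and merge is enabled
    //   luceneWriter.commit();
    //   luceneWriter.close();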
    /**
     * Will merge all temporary indices together into the initial indexWriter index. This is only necessary if SplitAndMerge is enabled. Otherwise you don't have to
     * invoke this method.
     */
    @Override
    public void crawlFinished()
    {
        try
        {
            for (int i = 0; i < m_llConsumerThreads.size(); i++)
                m_addDocsQueue.put(new InterruptThreadList());

            m_cyclicBarrier4DocConsumerThreads.await();

            m_llConsumerThreads.clear();

            if (getSplitAndMergeIndex() <= 0) return;

            // now we merge all temporary indices into the original one
            // the temporary writers still have to be closed - we do that now. The last one is not yet in the list

            if (m_luceneWriter != m_initialLuceneWriter)
            {
                for (IndexWriter writer2close : m_llIndexWriter2Close)
                    writer2close.close();

                m_luceneWriter.close();
            }

            LinkedList<Directory> llIndicesDirs2Merge = new LinkedList<Directory>();

            for (String strTmpPath : m_hsTmpLuceneWriterPaths2Merge)
                llIndicesDirs2Merge.add(new SimpleFSDirectory(Paths.get(strTmpPath)));

            if (llIndicesDirs2Merge.size() == 0) return;

            Logger.getLogger(ToLuceneContentHandler.class.getName())
                    .info("Will merge " + llIndicesDirs2Merge.size() + " temporary indices to the final one.");

            m_initialLuceneWriter.addIndexes(llIndicesDirs2Merge.toArray(new Directory[0]));

            m_initialLuceneWriter.commit();

            for (String strTmpPath : m_hsTmpLuceneWriterPaths2Merge)
                FileUtils.deleteDirectory(new File(strTmpPath));
        }
        catch (Exception e)
        {
            Logger.getLogger(ToLuceneContentHandler.class.getName()).log(Level.SEVERE, "Error", e);
        }
    }



    public boolean getBlockIndexing()
    {
        return m_bBlockIndexing;
    }



    /**
     * Gets the field aggregation map. This means that you want to generate a field entry whose value should be copied from another, existing metadata entry. You
     * can specify a list of these source attributes; the first one that has an entry wins and appears as the new attribute, so the source field name list is in fact a
     * prioritized list.
     *
     * @return the current field aggregation map
     */
    public MultiValueHashMap<String, String> getFieldAggregationMap()
    {
        return m_hsTarget2SourcesFieldnames;
    }



    /**
     * Gets the field config
     *
     * @return the field config
     */
    public FieldConfig getFieldConfig()
    {
        return m_fieldConfig;
    }



    /**
     * Gets the field copy mappings. This means that the content of every metadata key that is specified as a key inside hsSource2TargetFieldnames will be copied into
     * several other fields. The field names of these fields are specified as the corresponding values inside hsSource2TargetFieldnames. In the case you want to rename
     * attribute names, specify a field mapping and ignore the source field name with {@link #setFieldNames2Ignore(HashSet)}
     *
     * @return the current field mappings
     */
    public MultiValueHashMap<String, String> getFieldCopyMap()
    {
        return m_hsSource2TargetFieldnames;
    }



    /**
     * Gets the set of field names / metadata key values that will NOT be stored into the lucene index.
     *
     * @return the set of field names / metadata key values that will NOT be stored into the lucene index.
     */
    public HashSet<String> getFields2Ignore()
    {
        return m_hsAttNamesNot2Store;
    }



    /**
     * All docs without at least one of the given fieldname-value pairs will be ignored. You can specify regular expressions as field values.
     *
     * @return the fieldname-value pairs. At least one has to match for a document to be written into the index
     */
    public Map<String, String> getIgnoreAllDocsWithout()
    {
        return m_hsFieldName2FieldValueConstraint;
    }
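    // Configuration sketch (illustrative only): copying, aggregating and suppressing fields. The field names ("dc:title", "title", "author",
    // "creator") are made-up examples, and an add(key, value) method on MultiValueHashMap is assumed here - adapt to the actual API if it differs.
    //
    //   // copy the metadata key "dc:title" additionally into a field named "title"
    //   handler.getFieldCopyMap().add("dc:title", "title");
    //
    //   // aggregation: fill "author" from the first of "creator" or "dc:title" that carries a value
    //   handler.getFieldAggregationMap().add("author", "creator");
    //   handler.getFieldAggregationMap().add("author", "dc:title");
    //
    //   // do not store the original "dc:title" field - together with the copy map above this acts as a rename
    //   handler.getFields2Ignore().add("dc:title");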
    /**
     * If split and merge is enabled, {@link ToLuceneContentHandler} will check at each {@link #processNewData(Metadata, String)} invocation whether the current
     * indexWriter has more than iSplitIndexDocumentCount documents. In the case it has more, {@link ToLuceneContentHandler} will create an entirely new index for
     * writing, until this one also gets 'overfilled'. In the case your crawl is finished, {@link Leech} invokes {@link ToLuceneContentHandler#crawlFinished()}. This will
     * merge all temporary indices into the initial indexWriter object. This is for performance reasons because writing into a Lucene index tends to get slow after a
     * certain size. Splitting and merging afterwards is faster.
     *
     * @return the document count at which a new index will be created
     */
    public int getSplitAndMergeIndex()
    {
        return m_iSplitIndexDocumentCount;
    }



    /**
     * Gets the attribute value pairs that will be added to every crawled document.
     *
     * @return the current static attribute value pairs
     */
    public MultiValueHashMap<String, String> getStaticAttributeValuePairs()
    {
        return m_hsStaticAttValuePairs;
    }



    @Override
    public void processErrorData(Metadata metadata)
    {
        // NOP
    }



    @Override
    public void processModifiedData(Metadata metadata, String strFulltext)
    {
        try
        {
            // here we modify an already existing document

            Document luceneDocument = createAndFillLuceneDocument(metadata, strFulltext);
            if (luceneDocument == null) return;

            // TODO: what happens here with block-indexed documents?

            m_initialLuceneWriter.updateDocument(new Term(IncrementalCrawlingHistory.dataEntityId, metadata.get(IncrementalCrawlingHistory.dataEntityId)),
                    luceneDocument);
        }
        catch (Exception e)
        {
            Logger.getLogger(ToLuceneContentHandler.class.getName()).log(Level.SEVERE, "Error during writing into the index", e);
        }
    }



    @Override
    public void processNewData(Metadata metadata, String strFulltext)
    {
        try
        {
            if (m_initialLuceneWriter == null) throw new IllegalStateException("Lucene writer was not specified");

            m_luceneWriter = getCurrentWriter();
            ensureConsumerThreadsRunning();

            Document doc = createAndFillLuceneDocument(metadata, strFulltext);
            if (doc == null) return;

            // if it is a parent or child doc, we keep it for now, until we have a complete block. When we encounter a child doc, we write at the
            // next parent doc and collect all children until then.
            // - if we encounter a doc without a parent or child id, we write out all docs collected so far as single documents - not as a
            // block

            if (ToLuceneContentHandler.this.getBlockIndexing())
            {
                if (metadata.get(LeechMetadata.parentId) != null)
                {
                    // we have a child doc (we have a reference to our parent). We simply remember it
                    m_llLastChildDocuments.add(doc);
                }
                else if (metadata.get(LeechMetadata.childId) != null)
                {
                    // we have a parent doc (a parent has at least one childId) - we write it as a block together with the docs collected so far.
                    // The parent doc is the last one in the block
                    m_llLastChildDocuments.add(doc);

                    m_addDocsQueue.put(new LinkedList<Document>(m_llLastChildDocuments));
                    m_llLastChildDocuments.clear();
                }
                else
                {
                    // we have neither a child nor a parent ID - all collected child docs are written out as single docs
                    for (Document orphanDoc : m_llLastChildDocuments)
                        m_addDocsQueue.put(Collections.singletonList(orphanDoc));

                    m_addDocsQueue.put(Collections.singletonList(doc));
                }
            }
            else
            {
                m_addDocsQueue.put(Collections.singletonList(doc));
            }
        }
        catch (Exception e)
        {
            Logger.getLogger(ToLuceneContentHandler.class.getName()).log(Level.SEVERE, "Error", e);
        }
    }



    public void processNewDocument(Document doc)
    {
        try
        {
            if (m_initialLuceneWriter == null) throw new IllegalStateException("Lucene writer was not specified");

            m_luceneWriter = getCurrentWriter();
            ensureConsumerThreadsRunning();

            if (doc == null) return;

            // if it is a parent or child doc, we keep it for now, until we have a complete block. When we encounter a child doc, we write at the
            // next parent doc and collect all children until then.
            // - if we encounter a doc without a parent or child id, we write out all docs collected so far as single documents - not as a
            // block

            if (ToLuceneContentHandler.this.getBlockIndexing())
            {
                if (doc.get(LeechMetadata.parentId) != null)
                {
                    // we have a child doc (we have a reference to our parent). We simply remember it
                    m_llLastChildDocuments.add(doc);
                }
                else if (doc.get(LeechMetadata.childId) != null)
                {
                    // we have a parent doc (a parent has at least one childId) - we write it as a block together with the docs collected so far.
                    // The parent doc is the last one in the block
                    m_llLastChildDocuments.add(doc);

                    m_addDocsQueue.put(new LinkedList<Document>(m_llLastChildDocuments));
                    m_llLastChildDocuments.clear();
                }
                else
                {
                    // we have neither a child nor a parent ID - all collected child docs are written out as single docs
                    for (Document orphanDoc : m_llLastChildDocuments)
                        m_addDocsQueue.put(Collections.singletonList(orphanDoc));

                    m_addDocsQueue.put(Collections.singletonList(doc));
                }
            }
            else
            {
                m_addDocsQueue.put(Collections.singletonList(doc));
            }
        }
        catch (Exception e)
        {
            Logger.getLogger(ToLuceneContentHandler.class.getName()).log(Level.SEVERE, "Error", e);
        }
    }



    @Override
    public void processProcessedData(Metadata metadata)
    {
        // NOP
    }



    @Override
    public void processRemovedData(Metadata metadata)
    {
        // here we can make use of the incremental crawling ids - they might still be inside the metadata :) :)
        try
        {
            // TODO: what happens here with block-indexed documents?

            m_initialLuceneWriter.deleteDocuments(new Term(IncrementalCrawlingHistory.dataEntityId, metadata.get(IncrementalCrawlingHistory.dataEntityId)));
        }
        catch (Exception e)
        {
            Logger.getLogger(ToLuceneContentHandler.class.getName()).log(Level.SEVERE, "Error during writing into the index", e);
        }
    }



    @Override
    public void processUnmodifiedData(Metadata metadata)
    {
        // NOP
    }
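    // Block indexing sketch (illustrative only): feeding parent/child documents directly via processNewDocument(Document). The identifiers
    // ("doc42", "doc42-child1") are made-up example values; only the LeechMetadata.parentId / LeechMetadata.childId keys are what this class checks.
    //
    //   Document child = new Document();
    //   child.add(fieldConfig.createField(LeechMetadata.parentId, "doc42"));          // marks it as a child document
    //   handler.processNewDocument(child);                                            // collected, not yet written
    //
    //   Document parent = new Document();
    //   parent.add(fieldConfig.createField(LeechMetadata.childId, "doc42-child1"));   // marks it as a parent document
    //   handler.processNewDocument(parent);                                           // writes child + parent together as one block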
    /**
     * Sets whether block indexing with {@link IndexWriter#addDocuments(java.util.Collection, Analyzer)} is enabled or not. If it is enabled,
     * {@link ToLuceneContentHandler} checks whether the metadata contains a {@link LeechMetadata#childId} or a {@link LeechMetadata#parentId} key. Documents with a
     * {@link LeechMetadata#childId} entry will appear as parent documents, docs with a {@link LeechMetadata#parentId} entry as children. {@link ToLuceneContentHandler}
     * collects the child documents as they appear at a processXXX method, and writes them as a block at the time a succeeding parent document appears. In the case a
     * non-parent doc appears, all collected docs will be indexed normally, not as a block.
     *
     * @param blockIndexing true in the case block indexing should be enabled, false otherwise.
     */
    public void setBlockIndexing(boolean blockIndexing)
    {
        this.m_bBlockIndexing = blockIndexing;
    }



    /**
     * Sets the field aggregation map. This means that you want to generate a field entry whose value should be copied from another, existing metadata entry. You
     * can specify a list of these source attributes; the first one that has an entry wins and appears as the new attribute, so the source field name list is in fact a
     * prioritized list.
     *
     * @param hsTarget2SourcesFieldnames the field aggregation map
     */
    public void setFieldAggregationMap(MultiValueHashMap<String, String> hsTarget2SourcesFieldnames)
    {
        m_hsTarget2SourcesFieldnames = hsTarget2SourcesFieldnames;
    }



    /**
     * Sets the field copy mappings. This means that the content of every metadata key that is specified as a key inside hsSource2TargetFieldnames will be copied into
     * several other fields. The field names of these fields are specified as the corresponding values inside hsSource2TargetFieldnames. In the case you want to rename
     * attribute names, specify a field mapping and ignore the source field name with {@link #setFieldNames2Ignore(HashSet)}
     *
     * @param hsSource2TargetFieldnames keys: source field names, given as metadata keys. values: target field names - the content will also appear under these fields
     *            inside a lucene document
     */
    public void setFieldCopyMap(MultiValueHashMap<String, String> hsSource2TargetFieldnames)
    {
        m_hsSource2TargetFieldnames = hsSource2TargetFieldnames;
    }



    /**
     * Sets the set of field names / metadata key values that will NOT be stored into the lucene index. Nevertheless, you can consider these in
     * {@link #setFieldCopyMap(MultiValueHashMap)}. In this case you have 'moved' the attribute value into another attribute (or several ones).
     *
     * @param hsAttNamesNot2Store the set of attribute/field names that will not be stored into the lucene index
     */
    public void setFieldNames2Ignore(HashSet<String> hsAttNamesNot2Store)
    {
        m_hsAttNamesNot2Store = hsAttNamesNot2Store;
    }



    /**
     * All docs without at least one of the given fieldname-value pairs will be ignored. You can specify regular expressions as field values. If this is set to null or
     * to an empty map, all documents will be accepted.
     *
     * @param hsFieldName2FieldValue the fieldname-value pairs. At least one has to match for a document to be written into the index
     *
     * @return this
     */
    public ToLuceneContentHandler setIgnoreAllDocsWithout(Map<String, String> hsFieldName2FieldValue)
    {
        m_hsFieldName2FieldValueConstraint = hsFieldName2FieldValue;

        return this;
    }
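    // Constraint sketch (illustrative only): index only documents whose "source" field matches the given regular expression. The field name
    // "source" and the pattern are made-up examples; values are compared with String.matches(..) against every stored value of that field.
    //
    //   Map<String, String> hsConstraints = new HashMap<String, String>();
    //   hsConstraints.put("source", "http://www\\.dfki\\.de/.*");
    //   handler.setIgnoreAllDocsWithout(hsConstraints);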
    /**
     * If split and merge is enabled, {@link ToLuceneContentHandler} will check at each {@link #processNewData(Metadata, String)} invocation whether the current
     * indexWriter has more than iSplitIndexDocumentCount documents. In the case it has more, {@link ToLuceneContentHandler} will create an entirely new index for
     * writing, until this one also gets 'overfilled'. In the case your crawl is finished, invoking {@link ToLuceneContentHandler#crawlFinished()} merges all temporary
     * indices into the initial indexWriter object. This invocation will be done automatically by the {@link Leech} class. This is for performance reasons because
     * writing into a Lucene index tends to get slow after a certain size. Splitting and merging afterwards is faster. Update: this behaviour depends on the Lucene
     * version used; currently this does not seem to be a problem. Thus, this functionality is disabled by default.
     *
     * @param iSplitIndexDocumentCount the document count at which a new index will be created. A good size is 500 000 (a gut feeling, if it is necessary at all). -1 in
     *            the case you want to disable SplitAndMerge, which is the default.
     *
     * @return this
     */
    public ToLuceneContentHandler setSplitAndMergeIndex(int iSplitIndexDocumentCount)
    {
        m_iSplitIndexDocumentCount = iSplitIndexDocumentCount;

        return this;
    }



    /**
     * Sets some attribute value pairs that will be added to every crawled document.
     *
     * @param hsStaticAttValuePairs a multi value map containing the additional attribute value pairs
     *
     * @return this
     */
    public ToLuceneContentHandler setStaticAttributeValuePairs(MultiValueHashMap<String, String> hsStaticAttValuePairs)
    {
        m_hsStaticAttValuePairs = hsStaticAttValuePairs;

        return this;
    }



    protected void addStaticAttValuePairs(Document doc) throws Exception
    {
        for (Entry<String, String> fieldName2Value : getStaticAttributeValuePairs().entryList())
        {
            IndexableField field = m_fieldConfig.createField(fieldName2Value.getKey(), fieldName2Value.getValue());

            if (field != null)
                doc.add(field);
            else
                Logger.getLogger(ToLuceneContentHandler.class.getName())
                        .warning("Could not create lucene field for " + fieldName2Value.getKey() + ":" + fieldName2Value.getValue() + ". Will ignore it.");
        }
    }



    /**
     * Returns null in the case the documents should be ignored according to the given constraints (given with {@link #setIgnoreAllDocsWithout(Map)})
     *
     * @param metadata
     * @param strFulltext
     *
     * @return null in the case the documents should be ignored according to the given constraints (given with {@link #setIgnoreAllDocsWithout(Map)})
     *
     * @throws Exception
     */
    protected Document createAndFillLuceneDocument(Metadata metadata, String strFulltext) throws Exception
    {
        // // we don't create a new Document object if it is not absolutely necessary - for this we keep the references to the already allocated
        // // Document objects
        // // Document object reuse
        // Document doc = null;
        // for (Document preAllocatedDoc : m_llAllocatedDocuments)
        // {
        //     if(!m_llLastChildDocuments.contains(preAllocatedDoc))
        //     {
        //         doc = preAllocatedDoc;
        //         LinkedList<String> llFieldNames = new
        //         for (Fieldable field : doc.getFields())
        //             doc.removeFields(field.name());
        //
        //         break;
        //     }
        // }
        // if(doc == null)
        // {
        //     doc = new Document();
        //     m_llAllocatedDocuments.add(doc);
        // }

        Document doc = new Document();

        // The fact that you cannot create a Field out of a Reader is the reason why processNewMetaData passes the fulltext as a String and not as a
        // Reader

        // there must be a unique ID
        if (metadata.getValues(LeechMetadata.id).length == 0) doc.add(m_fieldConfig.createField(LeechMetadata.id, new UID().toString()));

        if (!getFields2Ignore().contains(LeechMetadata.body)) doc.add(m_fieldConfig.createField(LeechMetadata.body, strFulltext));

        // the copies
        for (String strFieldCopy : getFieldCopyMap().get(LeechMetadata.body))
            if (!getFields2Ignore().contains(strFieldCopy)) doc.add(m_fieldConfig.createField(strFieldCopy, strFulltext));

        // the remaining metadata
        for (String strFieldName : metadata.names())
        {
            if (!getFields2Ignore().contains(strFieldName))
            {
                for (String strValue : metadata.getValues(strFieldName))
                {
                    IndexableField field = m_fieldConfig.createField(strFieldName, strValue);

                    if (field != null)
                        doc.add(field);
                    else
                        Logger.getLogger(ToLuceneContentHandler.class.getName())
field for " + strFieldName + ":" + strValue + ". Will ignore it."); } } // die kopien for (String strFieldCopy : getFieldCopyMap().get(strFieldName)) if (!getFields2Ignore().contains(strFieldCopy)) { for (String strValue : metadata.getValues(strFieldName)) { IndexableField field = m_fieldConfig.createField(strFieldCopy, strValue); if (field != null) doc.add(field); else Logger.getLogger(ToLuceneContentHandler.class.getName()) .warning("Could not create lucene field for " + strFieldCopy + ":" + strValue + ". Will ignore it."); } } } // die statischen Attribut-Value-Paare addStaticAttValuePairs(doc); // und jetzt aggregieren wir noch for (String strTargetAtt : getFieldAggregationMap().keySet()) { // wenn es das TargetAtt schon im doc gibt, dann aggregieren wir nix if (doc.get(strTargetAtt) != null) continue; Collection<String> colSourceAtts = getFieldAggregationMap().get(strTargetAtt); for (String strSourceAtt : colSourceAtts) { String strNewValue = metadata.get(strSourceAtt); if (strNewValue == null) strNewValue = getStaticAttributeValuePairs().getFirst(strSourceAtt); if (strNewValue != null) { IndexableField field = m_fieldConfig.createField(strTargetAtt, strNewValue); if (field != null) doc.add(field); else Logger.getLogger(ToLuceneContentHandler.class.getName()) .warning("Could not create lucene field for " + strTargetAtt + ":" + strNewValue + ". Will ignore it."); break; } } } // wenn ein Doc nicht unseren constraints entspricht, dann ignorieren wir das hier, indem wir null zurck geben if (m_hsFieldName2FieldValueConstraint == null || m_hsFieldName2FieldValueConstraint.size() == 0) return doc; for (Entry<String, String> fieldname2fieldValRegEx : m_hsFieldName2FieldValueConstraint.entrySet()) { IndexableField[] fieldables = doc.getFields(fieldname2fieldValRegEx.getKey()); for (IndexableField fieldable : fieldables) { String strVal = fieldable.stringValue(); if (strVal.matches(fieldname2fieldValRegEx.getValue())) { // wir haben einen Treffer return doc; } } } return null; } protected void ensureConsumerThreadsRunning() { if (m_llConsumerThreads.size() != 0) return; int iCoreCount = Runtime.getRuntime().availableProcessors(); int iThreadCount = (int) Math.round(iCoreCount / 2d); iThreadCount = Math.max(iThreadCount, 1); m_cyclicBarrier4DocConsumerThreads = new CyclicBarrier(iThreadCount + 1); for (int i = 0; i < iThreadCount; i++) { Thread consumerThread = new Thread(new DocConsumer(), "ToLuceneContentHandlerDocConsumer " + i); m_llConsumerThreads.add(consumerThread); consumerThread.setDaemon(true); consumerThread.start(); } } synchronized protected IndexWriter getCurrentWriter() throws CorruptIndexException, LockObtainFailedException, IOException { if (getSplitAndMergeIndex() <= 0) return m_initialLuceneWriter; if (m_luceneWriter.maxDoc() < getSplitAndMergeIndex()) return m_luceneWriter; Directory directory = m_initialLuceneWriter.getDirectory(); Path fOurTmpDir = null; if (directory instanceof FSDirectory) { if (m_luceneWriter != m_initialLuceneWriter) m_llIndexWriter2Close.add(m_luceneWriter); String strTmpPath = ((FSDirectory) directory).getDirectory().toAbsolutePath().toString(); // if(strTmpPath.charAt(strTmpPath.length() - 1) == '/' || strTmpPath.charAt(strTmpPath.length() - 1) == '\\') // strTmpPath = strTmpPath.substring(0, strTmpPath.length() - 1); strTmpPath += "_" + (m_hsTmpLuceneWriterPaths2Merge.size() + 1); fOurTmpDir = Paths.get(strTmpPath); } else { // wir brauchen was temporres File parentDir = new File(System.getProperty("java.io.tmpdir")); fOurTmpDir = 
                    Paths.get(parentDir.getAbsolutePath() + "/leechTmp/" + UUID.randomUUID().toString().replaceAll("\\W", "_"));
        }

        Logger.getLogger(ToLuceneContentHandler.class.getName()).info("Current index exceeds " + m_iSplitIndexDocumentCount
                + " documents. Will create another temporary one under " + fOurTmpDir);

        @SuppressWarnings("deprecation")
        IndexWriterConfig config = new IndexWriterConfig(m_initialLuceneWriter.getConfig().getAnalyzer());
        config.setOpenMode(OpenMode.CREATE);

        m_luceneWriter = new IndexWriter(new SimpleFSDirectory(fOurTmpDir), config);

        m_hsTmpLuceneWriterPaths2Merge.add(fOurTmpDir.toAbsolutePath().toString());

        return m_luceneWriter;
    }



    @Override
    protected void init()
    {
        Logger.getLogger(ToLuceneContentHandler.class.getName())
                .info("Will write crawled data into " + m_luceneWriter.getDirectory().toString());

        ensureConsumerThreadsRunning();
    }

}