de.dfki.km.leech.lucene.ToLuceneContentHandler.java Source code


Introduction

Here is the source code for de.dfki.km.leech.lucene.ToLuceneContentHandler.java
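
Before the listing, here is a minimal usage sketch showing how the handler is typically wired together with an IndexWriter and Leech. The index path, analyzer, directory to crawl, and the exact Leech.parse overload are illustrative assumptions, not taken from this file.

import java.io.File;
import java.nio.file.Paths;

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.FSDirectory;

import de.dfki.inquisition.lucene.FieldConfig;
import de.dfki.km.leech.Leech;
import de.dfki.km.leech.lucene.ToLuceneContentHandler;

public class CrawlIntoLuceneExample {

    public static void main(String[] args) throws Exception {
        // open (or create) the target index - path and analyzer are illustrative
        IndexWriterConfig config = new IndexWriterConfig(new StandardAnalyzer());

        try (IndexWriter writer = new IndexWriter(FSDirectory.open(Paths.get("./luceneIndex")), config)) {

            // the handler turns every crawled data entity into a Lucene Document
            ToLuceneContentHandler handler = new ToLuceneContentHandler(new FieldConfig(), writer);

            // crawl a directory and feed the results into the handler
            // (a parse(File, ContentHandler) overload is assumed here - check the Leech API for the exact signature)
            new Leech().parse(new File("./dataToCrawl"), handler);

            // Leech invokes handler.crawlFinished() at the end of the crawl, which merges
            // temporary indices in case SplitAndMerge was enabled (see the Javadoc below)
            writer.commit();
        }
    }
}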

Source

/*
 * Leech - crawling capabilities for Apache Tika
 * 
 * Copyright (C) 2012 DFKI GmbH, Author: Christian Reuschling
 * 
 * This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation,
 * either version 3 of the License, or (at your option) any later version.
 * 
 * This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
 * PARTICULAR PURPOSE. See the GNU General Public License for more details.
 * 
 * You should have received a copy of the GNU General Public License along with this program. If not, see <http://www.gnu.org/licenses/>.
 * 
 * Contact us by mail: christian.reuschling@dfki.de
 */

package de.dfki.km.leech.lucene;

import java.io.File;
import java.io.IOException;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.rmi.server.UID;
import java.util.Collection;
import java.util.Collections;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.UUID;
import java.util.concurrent.BlockingQueue;
import java.util.concurrent.CyclicBarrier;
import java.util.concurrent.LinkedBlockingQueue;
import java.util.logging.Level;
import java.util.logging.Logger;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
import org.apache.lucene.index.IndexableField;
import org.apache.lucene.index.Term;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.store.LockObtainFailedException;
import org.apache.lucene.store.SimpleFSDirectory;
import org.apache.lucene.util.Version;
import org.apache.tika.metadata.Metadata;

import de.dfki.inquisition.collections.MultiValueHashMap;
import de.dfki.inquisition.file.FileUtils;
import de.dfki.inquisition.lucene.FieldConfig;
import de.dfki.km.leech.Leech;
import de.dfki.km.leech.metadata.LeechMetadata;
import de.dfki.km.leech.parser.incremental.IncrementalCrawlingHistory;
import de.dfki.km.leech.sax.DataSinkContentHandler;

/**
 * This is a content handler that stores crawled data into a Lucene index. You can configure the field types and the analyzers that should be used.
 * Furthermore, block indexing with {@link IndexWriter#addDocuments(java.util.Collection, Analyzer)} is supported; you can enable it with
 * {@link ToLuceneContentHandler#setBlockIndexing(boolean)}. If it is enabled, {@link ToLuceneContentHandler} checks whether the metadata contains a
 * {@link LeechMetadata#childId} or a {@link LeechMetadata#parentId} key. Documents with a {@link LeechMetadata#childId} entry will appear as parent documents, docs with
 * a {@link LeechMetadata#parentId} as children. {@link ToLuceneContentHandler} collects the child documents as they appear at a processXXX method and writes them as a
 * block as soon as a succeeding parent document appears. In the case a non-parent doc appears, all collected docs will be indexed normally, not as a block.
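 * 
 * <p>
 * An illustrative sketch of the expected metadata (the ids are made up; {@code handler} is a {@link ToLuceneContentHandler}):
 * 
 * <pre>{@code
 * Metadata child = new Metadata();
 * child.add(LeechMetadata.parentId, "P1");   // marks a child document - it is only collected
 * 
 * Metadata parent = new Metadata();
 * parent.add(LeechMetadata.childId, "C1");   // marks a parent document
 * 
 * // handler.processNewData(child, "child text") collects the child;
 * // handler.processNewData(parent, "parent text") then writes child + parent as one block
 * }</pre>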
 * 
 * @author Christian Reuschling, Dipl.Ing.(BA)
 * 
 */
public class ToLuceneContentHandler extends DataSinkContentHandler {

    protected class DocConsumer implements Runnable {

        @Override
        public void run() {
            try {
                while (true) {
                    List<Document> llDocs = m_addDocsQueue.take();

                    if (llDocs instanceof InterruptThreadList) {
                        break;
                    }

                    try {

                        if (llDocs.size() == 1) {
                            getCurrentWriter().addDocument(llDocs.get(0));
                        } else if (llDocs.size() > 1) {
                            getCurrentWriter().addDocuments(llDocs);
                        }

                    } catch (Exception e) {
                        Logger.getLogger(ToLuceneContentHandler.DocConsumer.class.getName()).log(Level.WARNING,
                                "Error during writing a document to the index (lucene exception while addDocument) - will ignore it. This is a hint to a lucene bug."
                                        + llDocs);
                    }

                }
            } catch (InterruptedException e) {
                // NOP
            } catch (Exception e) {
                Logger.getLogger(ToLuceneContentHandler.DocConsumer.class.getName()).log(Level.SEVERE, "Error", e);

            } finally {
                try {
                    m_cyclicBarrier4DocConsumerThreads.await();
                } catch (Exception e2) {
                    Logger.getLogger(ToLuceneContentHandler.DocConsumer.class.getName()).log(Level.SEVERE, "Error",
                            e2);
                }
            }

        }
    }

    protected class InterruptThreadList extends LinkedList<Document> {
        private static final long serialVersionUID = 196832081918659203L;
    }

    protected final BlockingQueue<List<Document>> m_addDocsQueue = new LinkedBlockingQueue<List<Document>>(23);

    protected boolean m_bBlockIndexing = true;

    protected CyclicBarrier m_cyclicBarrier4DocConsumerThreads;

    protected FieldConfig m_fieldConfig = new FieldConfig();

    protected HashSet<String> m_hsAttNamesNot2Store = new HashSet<String>();

    protected Map<String, String> m_hsFieldName2FieldValueConstraint;

    protected MultiValueHashMap<String, String> m_hsSource2TargetFieldnames = new MultiValueHashMap<String, String>();

    protected MultiValueHashMap<String, String> m_hsStaticAttValuePairs = new MultiValueHashMap<String, String>();

    protected MultiValueHashMap<String, String> m_hsTarget2SourcesFieldnames = new MultiValueHashMap<String, String>();

    protected HashSet<String> m_hsTmpLuceneWriterPaths2Merge = new HashSet<String>();

    protected IndexWriter m_initialLuceneWriter;

    protected int m_iSplitIndexDocumentCount = -1;

    protected LinkedList<Thread> m_llConsumerThreads = new LinkedList<Thread>();

    protected LinkedList<IndexWriter> m_llIndexWriter2Close = new LinkedList<IndexWriter>();

    protected LinkedList<Document> m_llLastChildDocuments = new LinkedList<Document>();

    protected IndexWriter m_luceneWriter;

    public ToLuceneContentHandler(FieldConfig fieldConfig, IndexWriter luceneWriter) throws Exception {
        super();
        m_fieldConfig = fieldConfig;
        m_luceneWriter = luceneWriter;
        m_initialLuceneWriter = m_luceneWriter;

        init();
    }

    public ToLuceneContentHandler(int writeLimit, FieldConfig fieldConfig, IndexWriter luceneWriter)
            throws Exception {
        super(writeLimit);
        m_fieldConfig = fieldConfig;
        m_luceneWriter = luceneWriter;
        m_initialLuceneWriter = m_luceneWriter;

        init();
    }

    public ToLuceneContentHandler(Metadata metadata, FieldConfig fieldConfig, IndexWriter luceneWriter)
            throws Exception {
        super(metadata);
        m_fieldConfig = fieldConfig;
        m_luceneWriter = luceneWriter;
        m_initialLuceneWriter = m_luceneWriter;

        init();
    }

    public ToLuceneContentHandler(Metadata metadata, int writeLimit, FieldConfig fieldConfig,
            IndexWriter luceneWriter) throws Exception {
        super(metadata, writeLimit);
        m_fieldConfig = fieldConfig;
        m_luceneWriter = luceneWriter;
        m_initialLuceneWriter = m_luceneWriter;

        init();
    }

    /**
     * Will merge all temporary indices into the initial indexWriter index. This is only necessary if SplitAndMerge is enabled. Otherwise you don't have to invoke
     * this method.
     */
    @Override
    public void crawlFinished() {
        try {

            for (int i = 0; i < m_llConsumerThreads.size(); i++)
                m_addDocsQueue.put(new InterruptThreadList());

            m_cyclicBarrier4DocConsumerThreads.await();

            m_llConsumerThreads.clear();

            if (getSplitAndMergeIndex() <= 0)
                return;

            // now we merge all temporary indices into the original one

            // the temporary writers still have to be closed - we do that now. The last one is not yet in the list
            if (m_luceneWriter != m_initialLuceneWriter) {
                for (IndexWriter writer2close : m_llIndexWriter2Close)
                    writer2close.close();
                m_luceneWriter.close();
            }

            LinkedList<Directory> llIndicesDirs2Merge = new LinkedList<Directory>();

            for (String strTmpPath : m_hsTmpLuceneWriterPaths2Merge)
                llIndicesDirs2Merge.add(new SimpleFSDirectory(Paths.get(strTmpPath)));

            if (llIndicesDirs2Merge.size() == 0)
                return;

            Logger.getLogger(ToLuceneContentHandler.class.getName())
                    .info("Will merge " + llIndicesDirs2Merge.size() + " temporary indices to the final one.");

            m_initialLuceneWriter.addIndexes(llIndicesDirs2Merge.toArray(new Directory[0]));

            m_initialLuceneWriter.commit();

            for (String strTmpPath : m_hsTmpLuceneWriterPaths2Merge)
                FileUtils.deleteDirectory(new File(strTmpPath));

        } catch (Exception e) {
            Logger.getLogger(ToLuceneContentHandler.class.getName()).log(Level.SEVERE, "Error", e);
        }

    }

    public boolean getBlockIndexing() {
        return m_bBlockIndexing;
    }

    /**
     * Gets the field aggregation map. This means that you want to generate a field entry whose value is copied from another, existing metadata entry. You can specify a
     * list of these source attributes; the first one that has an entry wins and appears as the new attribute, so the source field name list is in fact a prioritized
     * list.
     * 
     * @return the current field aggregation map
     */
    public MultiValueHashMap<String, String> getFieldAggregationMap() {
        return m_hsTarget2SourcesFieldnames;
    }

    /**
     * Gets the field config
     * 
     * @return the field config
     */
    public FieldConfig getFieldConfig() {
        return m_fieldConfig;
    }

    /**
     * Gets the field copy mappings. This means that the content of every metadata key that is specified as a key inside hsSource2TargetFieldnames will be copied into
     * several other fields. The names of these fields are specified as the corresponding values inside hsSource2TargetFieldnames. In case you want to rename
     * attributes, specify a field mapping and ignore the source field name with {@link #setFieldNames2Ignore(HashSet)}.
     * 
     * @return the current field mappings
     */
    public MultiValueHashMap<String, String> getFieldCopyMap() {
        return m_hsSource2TargetFieldnames;
    }

    /**
     * Gets the set of field names / metadata key values that will NOT be stored into the lucene index.
     * 
     * @return the set of field names / metadata key values that will NOT be stored into the lucene index.
     */
    public HashSet<String> getFields2Ignore() {
        return m_hsAttNamesNot2Store;
    }

    /**
     * All docs without at least one of the given fieldname-value pairs will be ignored. You can specify regular expressions as field values.
     * 
     * @return the fieldname-value pairs. At least one has to match for a document to be written into the index
     */
    public Map<String, String> getIgnoreAllDocsWithout() {
        return m_hsFieldName2FieldValueConstraint;
    }

    /**
     * If split and merge is enabled, {@link ToLuceneContentHandler} will check at each {@link #processNewData(Metadata, String)} invocation whether the current
     * indexWriter has more than iSplitIndexDocumentCount documents. In the case it has more, {@link ToLuceneContentHandler} will create an entirely new index for
     * writing, until this one also gets 'overfilled'. In the case your crawl is finished, {@link Leech} invokes {@link ToLuceneContentHandler#crawlFinished()}. This will
     * merge all temporary indices into the initial indexWriter object. This is for performance reasons because writing into a Lucene index tends to get slow after a
     * certain size. Splitting and merging afterwards is faster.
     * 
     * @return the document count at which a new index will be created
     */
    public int getSplitAndMergeIndex() {
        return m_iSplitIndexDocumentCount;
    }

    /**
     * Gets the attribute value pairs that will be added to every crawled document.
     * 
     * @return the current static attribute value pairs
     */
    public MultiValueHashMap<String, String> getStaticAttributeValuePairs() {
        return m_hsStaticAttValuePairs;
    }

    @Override
    public void processErrorData(Metadata metadata) {
        // NOP
    }

    @Override
    public void processModifiedData(Metadata metadata, String strFulltext) {

        try {

            // here we modify an already existing document
            Document luceneDocument = createAndFillLuceneDocument(metadata, strFulltext);
            if (luceneDocument == null)
                return;

            // TODO: what happens here with block-indexed documents?
            m_initialLuceneWriter.updateDocument(new Term(IncrementalCrawlingHistory.dataEntityId,
                    metadata.get(IncrementalCrawlingHistory.dataEntityId)), luceneDocument);

        } catch (Exception e) {
            Logger.getLogger(ToLuceneContentHandler.class.getName()).log(Level.SEVERE,
                    "Error during writing into the index", e);
        }

    }

    @Override
    public void processNewData(Metadata metadata, String strFulltext) {

        try {
            if (m_initialLuceneWriter == null)
                throw new IllegalStateException("Lucene writer was not specified");

            m_luceneWriter = getCurrentWriter();

            ensureConsumerThreadsRunning();

            Document doc = createAndFillLuceneDocument(metadata, strFulltext);
            if (doc == null)
                return;

            // if it is a parent or a child doc, we collect it until we have a whole block. If we encounter a child doc,
            // we write on the next parent doc and collect all children until then
            // - if we encounter a doc without a parent or child id, we write out all docs collected so far as single documents - not as a
            // block

            if (ToLuceneContentHandler.this.getBlockIndexing()) {

                if (metadata.get(LeechMetadata.parentId) != null) {
                    // we have a child doc (it holds a reference to its parent). We simply collect it
                    m_llLastChildDocuments.add(doc);
                } else if (metadata.get(LeechMetadata.childId) != null) {
                    // we have a parent doc (a parent has at least one childId) - we write it together with the docs collected so far as a block. The
                    // parent doc is the last one
                    m_llLastChildDocuments.add(doc);

                    m_addDocsQueue.put(new LinkedList<Document>(m_llLastChildDocuments));

                    m_llLastChildDocuments.clear();
                } else {
                    // we have neither a child nor a parent id - all collected child docs are written out as single docs
                    for (Document orphanDoc : m_llLastChildDocuments)
                        m_addDocsQueue.put(Collections.singletonList(orphanDoc));

                    m_addDocsQueue.put(Collections.singletonList(doc));
                }

            } else {
                m_addDocsQueue.put(Collections.singletonList(doc));
            }

        } catch (Exception e) {
            Logger.getLogger(ToLuceneContentHandler.class.getName()).log(Level.SEVERE, "Error", e);
        }

    }

    public void processNewDocument(Document doc) {

        try {
            if (m_initialLuceneWriter == null)
                throw new IllegalStateException("Lucene writer was not specified");

            m_luceneWriter = getCurrentWriter();

            ensureConsumerThreadsRunning();

            if (doc == null)
                return;

            // if it is a parent or a child doc, we collect it until we have a whole block. If we encounter a child doc,
            // we write on the next parent doc and collect all children until then
            // - if we encounter a doc without a parent or child id, we write out all docs collected so far as single documents - not as a
            // block

            if (ToLuceneContentHandler.this.getBlockIndexing()) {

                if (doc.get(LeechMetadata.parentId) != null) {
                    // we have a child doc (it holds a reference to its parent). We simply collect it
                    m_llLastChildDocuments.add(doc);
                } else if (doc.get(LeechMetadata.childId) != null) {
                    // we have a parent doc (a parent has at least one childId) - we write it together with the docs collected so far as a block. The
                    // parent doc is the last one
                    m_llLastChildDocuments.add(doc);

                    m_addDocsQueue.put(new LinkedList<Document>(m_llLastChildDocuments));

                    m_llLastChildDocuments.clear();
                } else {
                    // we have neither a child nor a parent id - all collected child docs are written out as single docs
                    for (Document orphanDoc : m_llLastChildDocuments)
                        m_addDocsQueue.put(Collections.singletonList(orphanDoc));

                    m_addDocsQueue.put(Collections.singletonList(doc));
                }

            } else {
                m_addDocsQueue.put(Collections.singletonList(doc));
            }

        } catch (Exception e) {
            Logger.getLogger(ToLuceneContentHandler.class.getName()).log(Level.SEVERE, "Error", e);
        }

    }

    @Override
    public void processProcessedData(Metadata metadata) {
        // NOP
    }

    @Override
    public void processRemovedData(Metadata metadata) {
        // here one could make use of the incremental ids, which may still be present in the metadata :) :)

        try {

            // TODO: what happens here with block-indexed documents?
            m_initialLuceneWriter.deleteDocuments(new Term(IncrementalCrawlingHistory.dataEntityId,
                    metadata.get(IncrementalCrawlingHistory.dataEntityId)));

        } catch (Exception e) {
            Logger.getLogger(ToLuceneContentHandler.class.getName()).log(Level.SEVERE,
                    "Error during writing into the index", e);
        }

    }

    @Override
    public void processUnmodifiedData(Metadata metadata) {
        // NOP
    }

    /**
     * Sets whether block indexing with {@link IndexWriter#addDocuments(java.util.Collection, Analyzer)} is enabled or not. If it is enabled,
     * {@link ToLuceneContentHandler} checks whether the metadata contains a {@link LeechMetadata#childId} or a {@link LeechMetadata#parentId} key. Documents with a
     * {@link LeechMetadata#childId} entry will appear as parent documents, docs with a {@link LeechMetadata#parentId} as children. {@link ToLuceneContentHandler} collects
     * the child documents as they appear at a processXXX method and writes them as a block as soon as a succeeding parent document appears. In the case a non-parent doc
     * appears, all collected docs will be indexed normally, not as a block.
     * 
     * @param blockIndexing true in the case block indexing should be enabled, false otherwise.
     */
    public void setBlockIndexing(boolean blockIndexing) {
        this.m_bBlockIndexing = blockIndexing;
    }

    /**
     * Sets the field aggregation map. This means that you want to generate a field entry whose value is copied from another, existing metadata entry. You can specify a
     * list of these source attributes; the first one that has an entry wins and appears as the new attribute, so the source field name list is in fact a prioritized
     * list.
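     * 
     * <p>
     * An illustrative sketch ({@code handler} is a {@link ToLuceneContentHandler}; the field names and an {@code add(key, value)} method on MultiValueHashMap are assumptions):
     * 
     * <pre>{@code
     * MultiValueHashMap<String, String> hsTarget2Sources = new MultiValueHashMap<String, String>();
     * hsTarget2Sources.add("author", "dc:creator");   // fill "author" from dc:creator if present ...
     * hsTarget2Sources.add("author", "meta:author");  // ... otherwise from meta:author
     * handler.setFieldAggregationMap(hsTarget2Sources);
     * }</pre>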
     * 
     * @param hsTarget2SourcesFieldnames the field aggregation map
     */
    public void setFieldAggregationMap(MultiValueHashMap<String, String> hsTarget2SourcesFieldnames) {
        m_hsTarget2SourcesFieldnames = hsTarget2SourcesFieldnames;
    }

    /**
     * Sets the field copy mappings. This means that the content of every metadata key that is specified as a key inside hsSource2TargetFieldnames will be copied into
     * several other fields. The names of these fields are specified as the corresponding values inside hsSource2TargetFieldnames. In case you want to rename
     * attributes, specify a field mapping and ignore the source field name with {@link #setFieldNames2Ignore(HashSet)}.
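     * 
     * <p>
     * An illustrative sketch ({@code handler} is a {@link ToLuceneContentHandler}; the target field name and an {@code add(key, value)} method on MultiValueHashMap are assumptions):
     * 
     * <pre>{@code
     * MultiValueHashMap<String, String> hsCopyMap = new MultiValueHashMap<String, String>();
     * hsCopyMap.add(LeechMetadata.body, "content");   // additionally store the body under "content"
     * handler.setFieldCopyMap(hsCopyMap);
     * }</pre>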
     * 
     * @param hsSource2TargetFieldnames keys: source field names, given as metadata keys. values: target field names - the content will also appear under these fields
     *            inside a lucene document
     */
    public void setFieldCopyMap(MultiValueHashMap<String, String> hsSource2TargetFieldnames) {
        m_hsSource2TargetFieldnames = hsSource2TargetFieldnames;
    }

    /**
     * Sets the set of field names / metadata key values that will NOT be stored into the lucene index. Nevertheless, you can still refer to these in
     * {@link #setFieldCopyMap(MultiValueHashMap)}. In this case you have 'moved' the attribute value into another attribute (or several ones).
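     * 
     * <p>
     * An illustrative sketch of the 'rename' pattern ({@code handler} is a {@link ToLuceneContentHandler}; the field names and an {@code add(key, value)} method on MultiValueHashMap are assumptions):
     * 
     * <pre>{@code
     * MultiValueHashMap<String, String> hsCopyMap = new MultiValueHashMap<String, String>();
     * hsCopyMap.add("title", "heading");          // copy title into heading ...
     * handler.setFieldCopyMap(hsCopyMap);
     * 
     * HashSet<String> hsIgnore = new HashSet<String>();
     * hsIgnore.add("title");                      // ... and drop the original field
     * handler.setFieldNames2Ignore(hsIgnore);
     * }</pre>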
     * 
     * @param hsAttNamesNot2Store the set of attribute/field names that will not be stored into the lucene index
     */
    public void setFieldNames2Ignore(HashSet<String> hsAttNamesNot2Store) {
        m_hsAttNamesNot2Store = hsAttNamesNot2Store;
    }

    /**
     * All docs without at least one of the given fieldname-value pairs will be ignored. You can specify regular expressions as field values. If this is set to null or to
     * an empty map, all documents will be accepted.
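     * 
     * <p>
     * An illustrative sketch ({@code handler} is a {@link ToLuceneContentHandler}; the field name and regex are made up):
     * 
     * <pre>{@code
     * Map<String, String> hsConstraints = new HashMap<String, String>();
     * hsConstraints.put("source", "http.*");   // only index docs whose source looks like a web URL
     * handler.setIgnoreAllDocsWithout(hsConstraints);
     * }</pre>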
     * 
     * @param hsFieldName2FieldValue the fieldname-value pairs. At least one has to match for a document to be written into the index
     * 
     * @return this
     */
    public ToLuceneContentHandler setIgnoreAllDocsWithout(Map<String, String> hsFieldName2FieldValue) {
        m_hsFieldName2FieldValueConstraint = hsFieldName2FieldValue;

        return this;
    }

    /**
     * If split and merge is enabled, {@link ToLuceneContentHandler} will check at each {@link #processNewData(Metadata, String)} invocation whether the current
     * indexWriter has more than iSplitIndexDocumentCount documents. In the case it has more, {@link ToLuceneContentHandler} will create an entirely new index for
     * writing, until this one also gets 'overfilled'. In the case your crawl is finished, invoking {@link ToLuceneContentHandler#crawlFinished()} merges all temporary
     * indices into the initial indexWriter object. This invocation will be done automatically by the {@link Leech} class. This is for performance reasons because writing
     * into a Lucene index tends to get slow after a certain size. Splitting and merging afterwards is faster. Update: this behaviour depends on the Lucene version used;
     * currently this does not seem to be a problem. Thus, this functionality is disabled by default.
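     * 
     * <p>
     * An illustrative sketch ({@code handler} is a {@link ToLuceneContentHandler}):
     * 
     * <pre>{@code
     * handler.setSplitAndMergeIndex(500000);   // start a fresh temporary index every 500 000 documents
     * }</pre>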
     * 
     * @param iSplitIndexDocumentCount the document count at which a new index will be created. A good size is 500 000 (a gut feeling, if it is necessary at all). -1 in
     *            the case you want to disable SplitAndMerge, which is the default.
     * 
     * @return this
     */
    public ToLuceneContentHandler setSplitAndMergeIndex(int iSplitIndexDocumentCount) {
        m_iSplitIndexDocumentCount = iSplitIndexDocumentCount;

        return this;
    }

    /**
     * Sets some attribute value pairs that will be added to every crawled document.
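     * 
     * <p>
     * An illustrative sketch ({@code handler} is a {@link ToLuceneContentHandler}; the key/value and an {@code add(key, value)} method on MultiValueHashMap are assumptions):
     * 
     * <pre>{@code
     * MultiValueHashMap<String, String> hsStatic = new MultiValueHashMap<String, String>();
     * hsStatic.add("crawlSource", "intranet");   // tag every crawled document with this pair
     * handler.setStaticAttributeValuePairs(hsStatic);
     * }</pre>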
     * 
     * @param hsStaticAttValuePairs a multi value map containing the additional attribute value pairs
     * 
     * @return this
     */
    public ToLuceneContentHandler setStaticAttributeValuePairs(
            MultiValueHashMap<String, String> hsStaticAttValuePairs) {
        m_hsStaticAttValuePairs = hsStaticAttValuePairs;

        return this;
    }

    protected void addStaticAttValuePairs(Document doc) throws Exception {
        for (Entry<String, String> fieldName2Value : getStaticAttributeValuePairs().entryList()) {
            IndexableField field = m_fieldConfig.createField(fieldName2Value.getKey(), fieldName2Value.getValue());
            if (field != null)
                doc.add(field);
            else
                Logger.getLogger(ToLuceneContentHandler.class.getName())
                        .warning("Could not create lucene field for " + fieldName2Value.getKey() + ":"
                                + fieldName2Value.getValue() + ". Will ignore it.");
        }
    }

    /**
     * Returns null in the case the document should be ignored according to the given constraints (given with {@link #setIgnoreAllDocsWithout(Map)})
     * 
     * @param metadata
     * @param strFulltext
     * 
     * @return null in the case the document should be ignored according to the given constraints (given with {@link #setIgnoreAllDocsWithout(Map)})
     * 
     * @throws Exception
     */
    protected Document createAndFillLuceneDocument(Metadata metadata, String strFulltext) throws Exception {
        // // we do not create a new Document object unless absolutely necessary - for that we keep the references to the already allocated
        // // Document objects
        // // Document object reuse
        // Document doc = null;
        // for (Document preAllocatedDoc : m_llAllocatedDocuments)
        // {
        // if(!m_llLastChildDocuments.contains(preAllocatedDoc))
        // {
        // doc = preAllocatedDoc;
        // LinkedList<String> llFieldNames = new
        // for (Fieldable field : doc.getFields())
        // doc.removeFields(field.name());
        //
        // break;
        // }
        // }
        // if(doc == null)
        // {
        // doc = new Document();
        // m_llAllocatedDocuments.add(doc);
        // }

        Document doc = new Document();

        // The fact that you cannot create a Field from a reader is the reason why processNewMetaData passes the fulltext as a String and not as a
        // reader

        // a unique ID must be present
        if (metadata.getValues(LeechMetadata.id).length == 0)
            doc.add(m_fieldConfig.createField(LeechMetadata.id, new UID().toString()));
        if (!getFields2Ignore().contains(LeechMetadata.body))
            doc.add(m_fieldConfig.createField(LeechMetadata.body, strFulltext));
        // the copies
        for (String strFieldCopy : getFieldCopyMap().get(LeechMetadata.body))
            if (!getFields2Ignore().contains(strFieldCopy))
                doc.add(m_fieldConfig.createField(strFieldCopy, strFulltext));

        // the remaining metadata
        for (String strFieldName : metadata.names()) {
            if (!getFields2Ignore().contains(strFieldName)) {
                for (String strValue : metadata.getValues(strFieldName)) {
                    IndexableField field = m_fieldConfig.createField(strFieldName, strValue);
                    if (field != null)
                        doc.add(field);
                    else
                        Logger.getLogger(ToLuceneContentHandler.class.getName())
                                .warning("Could not create lucene field for " + strFieldName + ":" + strValue
                                        + ". Will ignore it.");
                }

            }

            // the copies
            for (String strFieldCopy : getFieldCopyMap().get(strFieldName))
                if (!getFields2Ignore().contains(strFieldCopy)) {
                    for (String strValue : metadata.getValues(strFieldName)) {
                        IndexableField field = m_fieldConfig.createField(strFieldCopy, strValue);
                        if (field != null)
                            doc.add(field);
                        else
                            Logger.getLogger(ToLuceneContentHandler.class.getName())
                                    .warning("Could not create lucene field for " + strFieldCopy + ":" + strValue
                                            + ". Will ignore it.");
                    }
                }
        }

        // the static attribute-value pairs
        addStaticAttValuePairs(doc);

        // and now we do the aggregation
        for (String strTargetAtt : getFieldAggregationMap().keySet()) {
            // if the target attribute already exists in the doc, we don't aggregate anything
            if (doc.get(strTargetAtt) != null)
                continue;

            Collection<String> colSourceAtts = getFieldAggregationMap().get(strTargetAtt);

            for (String strSourceAtt : colSourceAtts) {
                String strNewValue = metadata.get(strSourceAtt);
                if (strNewValue == null)
                    strNewValue = getStaticAttributeValuePairs().getFirst(strSourceAtt);

                if (strNewValue != null) {
                    IndexableField field = m_fieldConfig.createField(strTargetAtt, strNewValue);
                    if (field != null)
                        doc.add(field);
                    else
                        Logger.getLogger(ToLuceneContentHandler.class.getName())
                                .warning("Could not create lucene field for " + strTargetAtt + ":" + strNewValue
                                        + ". Will ignore it.");

                    break;
                }
            }
        }

        // if a doc does not match our constraints, we ignore it here by returning null
        if (m_hsFieldName2FieldValueConstraint == null || m_hsFieldName2FieldValueConstraint.size() == 0)
            return doc;

        for (Entry<String, String> fieldname2fieldValRegEx : m_hsFieldName2FieldValueConstraint.entrySet()) {
            IndexableField[] fieldables = doc.getFields(fieldname2fieldValRegEx.getKey());
            for (IndexableField fieldable : fieldables) {
                String strVal = fieldable.stringValue();
                if (strVal.matches(fieldname2fieldValRegEx.getValue())) {
                    // we have a match
                    return doc;
                }
            }
        }

        return null;
    }

    protected void ensureConsumerThreadsRunning() {
        if (m_llConsumerThreads.size() != 0)
            return;

        int iCoreCount = Runtime.getRuntime().availableProcessors();
        int iThreadCount = (int) Math.round(iCoreCount / 2d);
        iThreadCount = Math.max(iThreadCount, 1);

        m_cyclicBarrier4DocConsumerThreads = new CyclicBarrier(iThreadCount + 1);
        for (int i = 0; i < iThreadCount; i++) {
            Thread consumerThread = new Thread(new DocConsumer(), "ToLuceneContentHandlerDocConsumer " + i);
            m_llConsumerThreads.add(consumerThread);
            consumerThread.setDaemon(true);

            consumerThread.start();
        }
    }

    synchronized protected IndexWriter getCurrentWriter()
            throws CorruptIndexException, LockObtainFailedException, IOException {

        if (getSplitAndMergeIndex() <= 0)
            return m_initialLuceneWriter;

        if (m_luceneWriter.maxDoc() < getSplitAndMergeIndex())
            return m_luceneWriter;

        Directory directory = m_initialLuceneWriter.getDirectory();

        Path fOurTmpDir = null;
        if (directory instanceof FSDirectory) {
            if (m_luceneWriter != m_initialLuceneWriter)
                m_llIndexWriter2Close.add(m_luceneWriter);

            String strTmpPath = ((FSDirectory) directory).getDirectory().toAbsolutePath().toString();
            // if(strTmpPath.charAt(strTmpPath.length() - 1) == '/' || strTmpPath.charAt(strTmpPath.length() - 1) == '\\')
            // strTmpPath = strTmpPath.substring(0, strTmpPath.length() - 1);
            strTmpPath += "_" + (m_hsTmpLuceneWriterPaths2Merge.size() + 1);
            fOurTmpDir = Paths.get(strTmpPath);
        } else {
            // we need something temporary
            File parentDir = new File(System.getProperty("java.io.tmpdir"));
            fOurTmpDir = Paths.get(parentDir.getAbsolutePath() + "/leechTmp/"
                    + UUID.randomUUID().toString().replaceAll("\\W", "_"));
        }

        Logger.getLogger(ToLuceneContentHandler.class.getName()).info("Current index exceeds "
                + m_iSplitIndexDocumentCount + " documents. Will create another temporary one under " + fOurTmpDir);

        @SuppressWarnings("deprecation")
        IndexWriterConfig config = new IndexWriterConfig(m_initialLuceneWriter.getConfig().getAnalyzer());
        config.setOpenMode(OpenMode.CREATE);

        m_luceneWriter = new IndexWriter(new SimpleFSDirectory(fOurTmpDir), config);
        m_hsTmpLuceneWriterPaths2Merge.add(fOurTmpDir.toAbsolutePath().toString());

        return m_luceneWriter;
    }

    @Override
    protected void init() {
        Logger.getLogger(ToLuceneContentHandler.class.getName())
                .info("Will write crawled data into " + m_luceneWriter.getDirectory().toString());

        ensureConsumerThreadsRunning();
    }

}