org.apache.jackrabbit.oak.plugins.tika.TextExtractor.java Source code

Introduction

Here is the source code for org.apache.jackrabbit.oak.plugins.tika.TextExtractor.java. TextExtractor pulls text out of binary resources with Apache Tika: a fixed pool of worker threads consumes BinaryResource items from a bounded blocking queue, writes the extracted text through a TextWriter, and tracks counters for every outcome (written, empty, parser error, unsupported, already processed) that are logged when the extractor is closed.
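
Before the listing, a minimal usage sketch. This is not part of the Oak codebase: InMemoryTextWriter is a hypothetical stand-in, it assumes the TextWriter interface declares exactly the four methods the extractor calls (isProcessed, write, markEmpty, markError), and the demo class must live in the same package because TextExtractor and BinaryResource are package-private.

package org.apache.jackrabbit.oak.plugins.tika; // assumption: same package as TextExtractor

import java.util.Collections;
import java.util.List;
import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;

import org.apache.jackrabbit.oak.plugins.blob.datastore.TextWriter;

public class TextExtractorDemo {

    // Hypothetical in-memory TextWriter, assuming the interface declares
    // only the methods TextExtractor actually calls.
    static class InMemoryTextWriter implements TextWriter {
        private final Map<String, String> store = new ConcurrentHashMap<String, String>();
        @Override public void write(String blobId, String text) { store.put(blobId, text); }
        @Override public void markEmpty(String blobId) { store.put(blobId, ""); }
        @Override public void markError(String blobId) { store.put(blobId, "<error>"); }
        @Override public boolean isProcessed(String blobId) { return store.containsKey(blobId); }
    }

    public static void main(String[] args) throws Exception {
        // In practice the binaries come from the repository; an empty list
        // keeps this sketch self-contained.
        List<BinaryResource> binaries = Collections.emptyList();

        TextExtractor extractor = new TextExtractor(new InMemoryTextWriter());
        extractor.setThreadPoolSize(4); // defaults to the number of available processors
        try {
            extractor.extract(binaries); // blocks whenever the bounded input queue is full
        } finally {
            extractor.close(); // enqueues the shutdown signal, waits for workers, logs stats
        }
    }
}

The try/finally is all the shutdown handling a caller needs: close() puts a sentinel on the queue, waits for the pool to drain, and then logs the extraction statistics.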

Source

/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package org.apache.jackrabbit.oak.plugins.tika;

import java.io.Closeable;
import java.io.File;
import java.io.IOException;
import java.io.PrintWriter;
import java.io.StringWriter;
import java.util.concurrent.ArrayBlockingQueue;
import java.util.concurrent.BlockingQueue;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.concurrent.atomic.AtomicLong;

import com.google.common.io.ByteSource;
import com.google.common.io.CountingInputStream;
import org.apache.jackrabbit.oak.commons.IOUtils;
import org.apache.jackrabbit.oak.commons.io.LazyInputStream;
import org.apache.jackrabbit.oak.plugins.blob.datastore.TextWriter;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.sax.WriteOutContentHandler;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

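/**
 * Extracts text from {@link BinaryResource} binaries using Apache Tika.
 * Work items are fanned out to a fixed pool of worker threads through a
 * bounded blocking queue; {@link #close()} enqueues a sentinel item that
 * tells the workers to stop, then logs the accumulated statistics.
 */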
class TextExtractor implements Closeable {
    private static final Logger log = LoggerFactory.getLogger(TextExtractor.class);
    private static final Logger parserError = LoggerFactory
            .getLogger("org.apache.jackrabbit.oak.plugins.tika.ParserError");
    private static final int PROGRESS_BATCH_SIZE = 1000;
    private static final int MAX_EXTRACT_LENGTH = 100000;
    private static final String ERROR_TEXT = "TextExtractionError";

    private final TextWriter textWriter;

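    // Poison pill: close() puts this sentinel on the queue, and each worker
    // re-queues it on sight so every thread in the pool eventually exits.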
    private final WorkItem SHUTDOWN_SIGNAL = new WorkItem(null);
    private BlockingQueue<WorkItem> inputQueue;
    private ExecutorService executorService;
    private int threadPoolSize = Runtime.getRuntime().availableProcessors();
    private int queueSize = 100;

    private final AtomicInteger errorCount = new AtomicInteger();
    private final AtomicLong timeTaken = new AtomicLong();
    private final AtomicInteger extractionCount = new AtomicInteger();
    private final AtomicInteger textWrittenCount = new AtomicInteger();
    private final AtomicInteger parserErrorCount = new AtomicInteger();
    private final AtomicInteger processedCount = new AtomicInteger();
    private final AtomicInteger emptyCount = new AtomicInteger();
    private final AtomicInteger notSupportedCount = new AtomicInteger();
    private final AtomicInteger alreadyExtractedCount = new AtomicInteger();
    private final AtomicLong extractedTextSize = new AtomicLong();
    private final AtomicLong nonEmptyExtractedTextSize = new AtomicLong();
    private final AtomicLong totalSizeRead = new AtomicLong();

    private int maxExtractedLength = MAX_EXTRACT_LENGTH;
    private File tikaConfig;
    private TikaHelper tika;
    private boolean initialized;
    private BinaryStats stats;
    private boolean closed;

    public TextExtractor(TextWriter textWriter) {
        this.textWriter = textWriter;
    }

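    /**
     * Queues the given binaries for extraction, lazily initializing the
     * queue, Tika parser and thread pool on first use. Blocks whenever the
     * bounded input queue is full, which throttles the producer.
     */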
    public void extract(Iterable<BinaryResource> binaries) throws InterruptedException, IOException {
        initialize();
        for (BinaryResource binary : binaries) {
            inputQueue.put(new WorkItem(binary));
        }
    }

    @Override
    public void close() {
        if (closed) {
            return;
        }
        if (!inputQueue.isEmpty()) {
            log.info("Shutting down the extractor. Pending task count {}", inputQueue.size());
        }

        if (executorService != null) {
            try {
                inputQueue.put(SHUTDOWN_SIGNAL);
                executorService.shutdown();
                //Wait long enough; 10 days is effectively "until all queued work is done"
                executorService.awaitTermination(10, TimeUnit.DAYS);
            } catch (InterruptedException e) {
                Thread.currentThread().interrupt();
            }
        }
        dumpStats();
        closed = true;
    }

    public void setTikaConfig(File tikaConfig) {
        this.tikaConfig = tikaConfig;
    }

    public void setThreadPoolSize(int threadPoolSize) {
        this.threadPoolSize = threadPoolSize;
    }

    public void setStats(BinaryStats stats) {
        this.stats = stats;
    }

    private void dumpStats() {
        StringWriter sw = new StringWriter();
        PrintWriter pw = new PrintWriter(sw);
        pw.println("Text extraction stats");
        pw.printf("\t Processed Count           : %d%n", processedCount.get());
        pw.printf("\t   Extraction Count        : %d%n", extractionCount.get());
        pw.printf("\t     Empty Count           : %d%n", emptyCount.get());
        pw.printf("\t     Text Written Count    : %d%n", textWrittenCount.get());
        pw.printf("\t   Parser Error Count      : %d%n", parserErrorCount.get());
        pw.printf("\t   Error Count             : %d%n", errorCount.get());
        pw.printf("\t   Not Supported Count     : %d%n", notSupportedCount.get());
        pw.printf("\t   Already processed Count : %d%n", alreadyExtractedCount.get());
        pw.printf("\t Total bytes read          : %s%n", IOUtils.humanReadableByteCount(totalSizeRead.get()));
        pw.printf("\t Total text extracted      : %s%n", IOUtils.humanReadableByteCount(extractedTextSize.get()));
        pw.printf("\t   Non empty text          : %s%n",
                IOUtils.humanReadableByteCount(nonEmptyExtractedTextSize.get()));
        pw.printf("\t Time taken                : %d sec%n", timeTaken.get() / 1000);
        pw.close();
        log.info(sw.toString());
    }

    private void dumpProgress(int count) {
        if (count % PROGRESS_BATCH_SIZE == 0) {
            String progress = "";
            if (stats != null) {
                double processedPercent = count * 1.0 / stats.getTotalCount() * 100;
                double indexedPercent = extractionCount.get() * 1.0 / stats.getIndexedCount() * 100;
                progress = String.format("(%1.2f%%) (Extraction stats %d/%d %1.2f%%, Ignored count %d)",
                        processedPercent, extractionCount.get(), stats.getIndexedCount(), indexedPercent,
                        notSupportedCount.get());
            }
            log.info("Processed {} {} binaries so far ...", count, progress);
        }
    }

    private synchronized void initialize() throws IOException {
        if (initialized) {
            return;
        }
        inputQueue = new ArrayBlockingQueue<WorkItem>(queueSize);
        tika = new TikaHelper(tikaConfig);
        initializeExecutorService();
        initialized = true;
    }

    private void extractText(BinaryResource source) throws IOException {
        String type = source.getMimeType();
        if (type == null || !tika.isSupportedMediaType(type)) {
            log.trace("Ignoring binary content for node {} due to unsupported " + "(or null) jcr:mimeType [{}]",
                    source, type);
            notSupportedCount.incrementAndGet();
            return;
        }

        String blobId = source.getBlobId();
        if (textWriter.isProcessed(blobId)) {
            alreadyExtractedCount.incrementAndGet();
            return;
        }

        //TODO Handle case where same blob is being concurrently processed
        Metadata metadata = new Metadata();
        metadata.set(Metadata.CONTENT_TYPE, type);
        if (source.getEncoding() != null) { // not mandatory
            metadata.set(Metadata.CONTENT_ENCODING, source.getEncoding());
        }

        String extractedContent = parseStringValue(source.getByteSource(), metadata, source.getPath());
        if (ERROR_TEXT.equals(extractedContent)) {
            textWriter.markError(blobId);
        } else if (extractedContent != null) {
            extractedContent = extractedContent.trim();
            if (!extractedContent.isEmpty()) {
                nonEmptyExtractedTextSize.addAndGet(extractedContent.length());
                textWriter.write(blobId, extractedContent);
                textWrittenCount.incrementAndGet();
            } else {
                textWriter.markEmpty(blobId);
                emptyCount.incrementAndGet();
            }
        }
    }

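    // Submits one long-lived Extractor task per pool thread; each loops on
    // the shared input queue until it takes the shutdown signal.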
    private void initializeExecutorService() {
        executorService = Executors.newFixedThreadPool(threadPoolSize);
        for (int i = 0; i < threadPoolSize; i++) {
            executorService.submit(new Extractor());
        }
        log.info("Initialized text extractor pool with {} threads", threadPoolSize);
    }

    private class Extractor implements Runnable {
        @Override
        public void run() {
            while (true) {
                WorkItem workItem = null;
                try {
                    workItem = inputQueue.take();
                    if (workItem == SHUTDOWN_SIGNAL) {
                        inputQueue.put(SHUTDOWN_SIGNAL); //put back for other workers
                        return;
                    }
                    extractText(workItem.source);
                    dumpProgress(processedCount.incrementAndGet());
                } catch (InterruptedException e) {
                    Thread.currentThread().interrupt();
                    return;
                } catch (Exception e) {
                    errorCount.incrementAndGet();
                    log.warn("Error occurred while processing {}", workItem, e);
                }
            }
        }
    }

    //~--------------------------------------< Tika >

    private String parseStringValue(ByteSource byteSource, Metadata metadata, String path) {
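        // Tika streams extracted characters into this handler; once
        // maxExtractedLength characters are written, it aborts the parse with
        // an exception that isWriteLimitReached() recognizes below.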
        WriteOutContentHandler handler = new WriteOutContentHandler(maxExtractedLength);
        long start = System.currentTimeMillis();
        long size = 0;
        try {
            CountingInputStream stream = new CountingInputStream(new LazyInputStream(byteSource));
            try {
                tika.getParser().parse(stream, handler, metadata, new ParseContext());
            } finally {
                size = stream.getCount();
                stream.close();
            }
        } catch (LinkageError e) {
            // Capture and ignore errors caused by extraction libraries
            // not being present. This is equivalent to disabling
            // selected media types in configuration, so we can simply
            // ignore these errors.
        } catch (Throwable t) {
            // Capture and report any other full text extraction problems.
            // The special STOP exception is used for normal termination.
            if (!handler.isWriteLimitReached(t)) {
                parserErrorCount.incrementAndGet();
                parserError.debug("Failed to extract text from a binary property: " + path
                        + " This is a fairly common case, and nothing to"
                        + " worry about. The stack trace is included to"
                        + " help improve the text extraction feature.", t);
                return ERROR_TEXT;
            }
        }
        String result = handler.toString();
        timeTaken.addAndGet(System.currentTimeMillis() - start);
        if (size > 0) {
            extractedTextSize.addAndGet(result.length());
            extractionCount.incrementAndGet();
            totalSizeRead.addAndGet(size);
            return result;
        }

        return null;
    }

    //~--------------------------------------< WorkItem >

    private static class WorkItem {
        final BinaryResource source;

        private WorkItem(BinaryResource source) {
            this.source = source;
        }

        @Override
        public String toString() {
            return source != null ? source.toString() : "<EMPTY>";
        }
    }

}