org.apache.jackrabbit.oak.plugins.tika.TextExtractorMain.java — source code listing

Java example

Introduction

This page shows the full source code of the class
org.apache.jackrabbit.oak.plugins.tika.TextExtractorMain.

Source

/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package org.apache.jackrabbit.oak.plugins.tika;

import static com.google.common.base.Preconditions.checkArgument;
import static com.google.common.base.Preconditions.checkNotNull;
import static java.util.Arrays.asList;

import java.io.Closeable;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.util.List;
import java.util.Map;
import java.util.Properties;
import java.util.UUID;

import com.google.common.collect.Maps;
import com.google.common.io.Closer;
import com.mongodb.MongoClientURI;
import com.mongodb.MongoURI;
import joptsimple.OptionParser;
import joptsimple.OptionSet;
import joptsimple.OptionSpec;
import org.apache.commons.io.FileUtils;
import org.apache.commons.io.IOUtils;
import org.apache.jackrabbit.aws.ext.ds.S3DataStore;
import org.apache.jackrabbit.core.data.DataStore;
import org.apache.jackrabbit.core.data.DataStoreException;
import org.apache.jackrabbit.core.data.FileDataStore;
import org.apache.jackrabbit.oak.commons.PropertiesUtil;
import org.apache.jackrabbit.oak.plugins.blob.datastore.DataStoreBlobStore;
import org.apache.jackrabbit.oak.plugins.blob.datastore.DataStoreTextWriter;
import org.apache.jackrabbit.oak.plugins.document.DocumentMK;
import org.apache.jackrabbit.oak.plugins.document.DocumentNodeStore;
import org.apache.jackrabbit.oak.plugins.document.util.MongoConnection;
import org.apache.jackrabbit.oak.plugins.segment.file.FileStore;
import org.apache.jackrabbit.oak.spi.blob.BlobStore;
import org.apache.jackrabbit.oak.spi.state.NodeStore;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Command-line tool for working with binary (e.g. PDF/Office) content stored
 * in an Oak repository. It supports three sub-commands, passed as non-option
 * arguments:
 * <ul>
 *   <li>{@code report}   — prints summary statistics about the binaries
 *       listed in the CSV data file</li>
 *   <li>{@code extract}  — runs Tika text extraction over those binaries and
 *       persists the extracted text via a {@link DataStoreTextWriter}</li>
 *   <li>{@code generate} — walks a NodeStore and writes the CSV data file of
 *       binary metadata that the other two commands consume</li>
 * </ul>
 * Binaries are resolved through a {@link BlobStore} backed by either a
 * {@link FileDataStore} ({@code --fds-path}) or an {@link S3DataStore}
 * ({@code --s3-config-path}).
 */
public class TextExtractorMain {
    private static final Logger log = LoggerFactory.getLogger(TextExtractorMain.class);

    /**
     * Entry point. Parses the command line, wires up the configured
     * BlobStore/NodeStore, then runs whichever of the three sub-commands
     * were requested (several may be combined in one invocation).
     *
     * @param args command-line arguments; non-option arguments select the
     *             sub-command(s): "report", "extract", "generate"
     * @throws Exception any failure is rethrown through the Closer so that
     *                   every registered resource is still released
     */
    public static void main(String[] args) throws Exception {
        // Every resource opened below (data stores, Mongo connection, node
        // store, text writer, extractor) is registered with this Closer and
        // released in reverse registration order in the finally block.
        Closer closer = Closer.create();
        String h = "tika [extract|report|generate]\n" + "\n"
                + "report   : Generates a summary report related to binary data\n"
                + "extract  : Performs the text extraction\n"
                + "generate : Generates the csv data file based on configured NodeStore/BlobStore";
        try {
            OptionParser parser = new OptionParser();
            OptionSpec<?> help = parser.acceptsAll(asList("h", "?", "help"), "show help").forHelp();

            OptionSpec<String> nodeStoreSpec = parser
                    .accepts("nodestore", "NodeStore detail /path/to/oak/repository | mongodb://host:port/database")
                    .withRequiredArg().ofType(String.class);

            // Flag only (no argument): selects the oak-segment-tar backend
            // over the legacy oak-segment one in bootStrapNodeStore().
            OptionSpec segmentTar = parser.accepts("segment-tar", "Use oak-segment-tar instead of oak-segment");

            OptionSpec<String> pathSpec = parser
                    .accepts("path", "Path in repository under which the binaries would be searched")
                    .withRequiredArg().ofType(String.class);

            OptionSpec<File> dataFileSpec = parser
                    .accepts("data-file", "Data file in csv format containing the binary metadata")
                    .withRequiredArg().ofType(File.class);

            OptionSpec<File> tikaConfigSpec = parser.accepts("tika-config", "Tika config file path")
                    .withRequiredArg().ofType(File.class);

            OptionSpec<File> fdsDirSpec = parser.accepts("fds-path", "Path of directory used by FileDataStore")
                    .withRequiredArg().ofType(File.class);

            OptionSpec<File> s3ConfigSpec = parser
                    .accepts("s3-config-path", "Path of properties file containing config for S3DataStore")
                    .withRequiredArg().ofType(File.class);

            OptionSpec<File> storeDirSpec = parser
                    .accepts("store-path", "Path of directory used to store extracted text content")
                    .withRequiredArg().ofType(File.class);

            OptionSpec<Integer> poolSize = parser
                    .accepts("pool-size", "Size of the thread pool used to perform text extraction. Defaults "
                            + "to number of cores on the system")
                    .withRequiredArg().ofType(Integer.class);

            // Non-option arguments carry the sub-command names; the help
            // text 'h' is shown for them in the usage output.
            OptionSpec<String> nonOption = parser.nonOptions(h);

            OptionSet options = parser.parse(args);
            List<String> nonOptions = nonOption.values(options);

            if (options.has(help)) {
                parser.printHelpOn(System.out);
                System.exit(0);
            }

            // At least one sub-command is required.
            if (nonOptions.isEmpty()) {
                parser.printHelpOn(System.err);
                System.exit(1);
            }

            boolean report = nonOptions.contains("report");
            boolean extract = nonOptions.contains("extract");
            boolean generate = nonOptions.contains("generate");
            File dataFile = null;
            File storeDir = null;
            File tikaConfigFile = null;
            BlobStore blobStore = null;
            BinaryResourceProvider binaryResourceProvider = null;
            BinaryStats stats = null;
            // Default repository path to scan; may be overridden by --path
            // (note: the override is applied only in the 'extract' branch).
            String path = "/";

            if (options.has(tikaConfigSpec)) {
                tikaConfigFile = tikaConfigSpec.value(options);
                checkArgument(tikaConfigFile.exists(), "Tika config file %s does not exist",
                        tikaConfigFile.getAbsolutePath());
            }

            if (options.has(storeDirSpec)) {
                storeDir = storeDirSpec.value(options);
                // A non-existent directory is allowed (DataStoreTextWriter
                // creates it); an existing non-directory path is an error.
                if (storeDir.exists()) {
                    checkArgument(storeDir.isDirectory(),
                            "Path [%s] specified for storing extracted " + "text content '%s' is not a directory",
                            storeDir.getAbsolutePath(), storeDirSpec.options());
                }
            }

            if (options.has(fdsDirSpec)) {
                File fdsDir = fdsDirSpec.value(options);
                checkArgument(fdsDir.exists(), "FileDataStore %s does not exist", fdsDir.getAbsolutePath());
                FileDataStore fds = new FileDataStore();
                fds.setPath(fdsDir.getAbsolutePath());
                // homeDir is not needed here since the path was set
                // explicitly above.
                fds.init(null);
                blobStore = new DataStoreBlobStore(fds);
            }

            if (options.has(s3ConfigSpec)) {
                File s3Config = s3ConfigSpec.value(options);
                checkArgument(s3Config.exists() && s3Config.canRead(),
                        "S3DataStore config cannot be read from [%s]", s3Config.getAbsolutePath());
                Properties props = loadProperties(s3Config);
                log.info("Loaded properties for S3DataStore from {}", s3Config.getAbsolutePath());
                String pathProp = "path";
                String repoPath = props.getProperty(pathProp);
                checkNotNull(repoPath, "Missing required property [%s] from S3DataStore config loaded from [%s]",
                        pathProp, s3Config);

                //Check if 'secret' key is defined. It should be non null for references
                //to be generated. As the ref are transient we can just use any random value
                //if not specified
                String secretConfig = "secret";
                if (props.getProperty(secretConfig) == null) {
                    props.setProperty(secretConfig, UUID.randomUUID().toString());
                }

                log.info("Using {} for S3DataStore ", repoPath);
                DataStore ds = createS3DataStore(props);
                // Bean-style population of the data store from the loaded
                // properties (e.g. path, cacheSize, connection settings).
                PropertiesUtil.populate(ds, toMap(props), false);
                // NOTE(review): this passes the literal property key "path"
                // as the init homeDir, not repoPath. The store appears to be
                // fully configured via the populated properties, so homeDir
                // may be unused here — but confirm "path" vs repoPath is
                // intentional.
                ds.init(pathProp);
                blobStore = new DataStoreBlobStore(ds);
                closer.register(asCloseable(ds));
            }

            if (options.has(dataFileSpec)) {
                dataFile = dataFileSpec.value(options);
            }

            // --data-file is mandatory for every sub-command.
            checkNotNull(dataFile, "Data file not configured with %s", dataFileSpec);

            if (report || extract) {
                // For report/extract the CSV must already exist (it is the
                // input); 'generate' is the command that produces it.
                checkArgument(dataFile.exists(), "Data file %s does not exist", dataFile.getAbsolutePath());

                binaryResourceProvider = new CSVFileBinaryResourceProvider(dataFile, blobStore);
                if (binaryResourceProvider instanceof Closeable) {
                    closer.register((Closeable) binaryResourceProvider);
                }

                stats = new BinaryStats(tikaConfigFile, binaryResourceProvider);
                String summary = stats.getSummary();
                log.info(summary);
            }

            if (generate) {
                String src = nodeStoreSpec.value(options);
                checkNotNull(blobStore,
                        "BlobStore found to be null. FileDataStore directory " + "must be specified via %s",
                        fdsDirSpec.options());
                // Redundant with the unconditional checkNotNull above, but
                // harmless; kept for an explicit message on this path.
                checkNotNull(dataFile, "Data file path not provided");
                NodeStore nodeStore = bootStrapNodeStore(src, options.has(segmentTar), blobStore, closer);
                BinaryResourceProvider brp = new NodeStoreBinaryResourceProvider(nodeStore, blobStore);
                CSVFileGenerator generator = new CSVFileGenerator(dataFile);
                generator.generate(brp.getBinaries(path));
            }

            if (extract) {
                checkNotNull(storeDir, "Directory to store extracted text content " + "must be specified via %s",
                        storeDirSpec.options());
                checkNotNull(blobStore,
                        "BlobStore found to be null. FileDataStore directory " + "must be specified via %s",
                        fdsDirSpec.options());

                // readOnly=false: the writer persists newly extracted text.
                DataStoreTextWriter writer = new DataStoreTextWriter(storeDir, false);
                TextExtractor extractor = new TextExtractor(writer);

                if (options.has(poolSize)) {
                    extractor.setThreadPoolSize(poolSize.value(options));
                }

                if (tikaConfigFile != null) {
                    extractor.setTikaConfig(tikaConfigFile);
                }

                if (options.has(pathSpec)) {
                    path = pathSpec.value(options);
                }

                closer.register(writer);
                closer.register(extractor);

                extractor.setStats(stats);
                log.info("Using path {}", path);
                extractor.extract(binaryResourceProvider.getBinaries(path));

                // Explicit close so results are flushed before main returns;
                // both objects are also registered with the Closer above, so
                // close() runs twice — NOTE(review): assumes their close() is
                // idempotent; confirm.
                extractor.close();
                writer.close();
            }

        } catch (Throwable e) {
            // Closer.rethrow records the primary failure so that exceptions
            // thrown while closing resources do not mask it.
            throw closer.rethrow(e);
        } finally {
            closer.close();
        }
    }

    /**
     * Copies all string properties into a plain map, as required by
     * {@link PropertiesUtil#populate}.
     */
    private static Map<String, ?> toMap(Properties properties) {
        Map<String, String> map = Maps.newHashMap();
        for (final String name : properties.stringPropertyNames()) {
            map.put(name, properties.getProperty(name));
        }
        return map;
    }

    /**
     * Creates an (uninitialized) S3DataStore configured with the given
     * properties; the caller populates and initializes it.
     */
    private static DataStore createS3DataStore(Properties props) throws IOException {
        S3DataStore s3ds = new S3DataStore();
        s3ds.setProperties(props);
        return s3ds;
    }

    /**
     * Loads the S3DataStore configuration from the given properties file.
     *
     * @throws IOException if the file cannot be opened or parsed
     */
    private static Properties loadProperties(File s3Config) throws IOException {
        Properties props = new Properties();
        InputStream is = FileUtils.openInputStream(s3Config);
        try {
            props.load(is);
        } finally {
            IOUtils.closeQuietly(is);
        }
        return props;
    }

    /**
     * Creates a NodeStore for the given source: a DocumentNodeStore when
     * {@code src} is a mongodb:// URI, otherwise a segment store rooted at
     * the given filesystem path. All created resources are registered with
     * the supplied Closer.
     *
     * @param src        mongodb:// URI or path to a segment store directory
     * @param segmentTar true to use the oak-segment-tar implementation
     * @param blobStore  blob store to wire into the node store
     * @param closer     receives every closeable resource created here
     */
    private static NodeStore bootStrapNodeStore(String src, boolean segmentTar, BlobStore blobStore, Closer closer)
            throws IOException {
        if (src.startsWith(MongoURI.MONGODB_PREFIX)) {
            MongoClientURI uri = new MongoClientURI(src);
            if (uri.getDatabase() == null) {
                System.err.println("Database missing in MongoDB URI: " + uri.getURI());
                System.exit(1);
            }
            MongoConnection mongo = new MongoConnection(uri.getURI());
            // Registered before the store so the Closer disposes the store
            // first, then closes the connection (reverse order).
            closer.register(asCloseable(mongo));
            DocumentNodeStore store = new DocumentMK.Builder().setBlobStore(blobStore).setMongoDB(mongo.getDB())
                    .getNodeStore();
            closer.register(asCloseable(store));
            return store;
        }

        if (segmentTar) {
            return SegmentTarUtils.bootstrap(src, blobStore, closer);
        }

        return SegmentUtils.bootstrap(src, blobStore, closer);
    }

    // Adapts a FileStore to Closeable. NOTE(review): no caller in this file;
    // presumably used by (or kept for) the segment bootstrap paths — verify
    // before removing.
    private static Closeable asCloseable(final FileStore fs) {
        return new Closeable() {
            @Override
            public void close() throws IOException {
                fs.close();
            }
        };
    }

    // Adapts a DataStore to Closeable, wrapping DataStoreException (checked)
    // in IOException as required by the Closeable contract.
    private static Closeable asCloseable(final DataStore ds) {
        return new Closeable() {
            @Override
            public void close() throws IOException {
                try {
                    ds.close();
                } catch (DataStoreException e) {
                    throw new IOException(e);
                }
            }
        };
    }

    // Adapts a DocumentNodeStore to Closeable via its dispose() method.
    private static Closeable asCloseable(final DocumentNodeStore dns) {
        return new Closeable() {
            @Override
            public void close() throws IOException {
                dns.dispose();
            }
        };
    }

    // Adapts a MongoConnection to Closeable.
    private static Closeable asCloseable(final MongoConnection con) {
        return new Closeable() {
            @Override
            public void close() throws IOException {
                con.close();
            }
        };
    }
}