com.edduarte.vokter.document.DocumentBuilder.java Source code

Introduction

Here is the source code for com.edduarte.vokter.document.DocumentBuilder.java
Source

/*
 * Copyright 2015 Eduardo Duarte
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.edduarte.vokter.document;

import com.edduarte.vokter.parser.Parser;
import com.edduarte.vokter.parser.ParserPool;
import com.edduarte.vokter.util.OSGiManager;
import com.google.common.base.Stopwatch;
import com.mongodb.DB;
import com.optimaize.langdetect.LanguageDetector;
import org.apache.tools.ant.filters.StringInputStream;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import javax.mail.internet.ContentType;
import javax.mail.internet.ParseException;
import java.io.BufferedInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.net.HttpURLConnection;
import java.net.URL;
import java.util.function.Supplier;

/**
 * Builder class that lazily loads a single document (from a web url or from
 * an in-memory string) and processes it through a {@link DocumentPipeline}
 * to produce a {@link Document}.
 * <p>
 * This class is a merge of the CorpusLoader class and the Processor classes from
 * the previous assignment.
 * <p>
 * Typical usage is to call one of the static factory methods, optionally chain
 * the {@code with...}/{@link #ignoreCase()} configuration methods, and finish
 * with {@link #build(DB, ParserPool)}. No input is fetched or parsed until
 * {@code build} is invoked.
 *
 * @author Eduardo Duarte (<a href="mailto:hello@edduarte.com">hello@edduarte.com</a>)
 * @version 1.3.2
 * @since 1.0.0
 */
public final class DocumentBuilder {

    private static final Logger logger = LoggerFactory.getLogger(DocumentBuilder.class);

    /**
     * Connect and read timeout, in milliseconds, applied to url fetches.
     */
    private static final int FETCH_TIMEOUT_MILLIS = 5000;

    /**
     * The low-footprint loader of the document, using a lazy stream.
     */
    private final Supplier<DocumentInput> documentLazySupplier;

    /**
     * The language detector that will assure that the right Stopword filter
     * and Stemmer are used for the input content.
     */
    private LanguageDetector langDetector;

    /**
     * Flag that sets usage of stopword filtering.
     */
    private boolean isStoppingEnabled = false;

    /**
     * Flag that sets usage of a porter stemmer.
     */
    private boolean isStemmingEnabled = false;

    /**
     * Flag that sets matching of equal occurrences with different casing.
     */
    private boolean ignoreCase = false;

    private DocumentBuilder(final Supplier<DocumentInput> documentLazySupplier) {
        this.documentLazySupplier = documentLazySupplier;
    }

    /**
     * Instantiates a builder that collects a document from the specified web
     * url, by fetching its content as an InputStream together with the base
     * type of its Content-Type header.
     * <p>
     * The url is only fetched when {@link #build(DB, ParserPool)} is invoked.
     * Any I/O failure or unparseable content type is rethrown at that point
     * wrapped in a {@link RuntimeException}.
     *
     * @param url the address of the document to fetch
     * @return a builder that will lazily fetch the specified url
     */
    public static DocumentBuilder fromUrl(final String url) {
        return new DocumentBuilder(() -> {
            try {
                URL urlToFetch = new URL(url);

                HttpURLConnection connection = (HttpURLConnection) urlToFetch.openConnection();
                // configure redirects on this connection only; the static
                // HttpURLConnection.setFollowRedirects would change the
                // setting for every connection in the JVM
                connection.setInstanceFollowRedirects(true);
                connection.setConnectTimeout(FETCH_TIMEOUT_MILLIS);
                connection.setReadTimeout(FETCH_TIMEOUT_MILLIS);

                InputStream contentStream = new BufferedInputStream(connection.getInputStream());
                try {
                    ContentType contentType = new ContentType(connection.getContentType());
                    return new DocumentInput(url, contentStream, contentType.getBaseType());

                } catch (ParseException | RuntimeException ex) {
                    // the stream was already opened, so release it before
                    // reporting that the document input could not be created
                    closeQuietly(contentStream);
                    throw ex;
                }

            } catch (IOException | ParseException ex) {
                throw new RuntimeException(ex);
            }
        });
    }

    /**
     * Instantiates a builder that collects a document from the specified
     * in-memory text. This factory method is mostly used for testing.
     * <p>
     * The content type is only parsed when {@link #build(DB, ParserPool)} is
     * invoked. An unparseable content type is rethrown at that point wrapped
     * in a {@link RuntimeException}.
     *
     * @param url  the url to associate with the document
     * @param text the content of the document
     * @param type the content type of the text, of which only the base type
     *             is kept
     * @return a builder that will lazily wrap the specified text
     */
    public static DocumentBuilder fromString(final String url, final String text, final String type) {
        return new DocumentBuilder(() -> {
            try {
                ContentType contentType = new ContentType(type);
                // NOTE(review): no explicit charset is passed to
                // StringInputStream -- confirm which encoding the readers
                // downstream expect.
                return new DocumentInput(url, new StringInputStream(text), contentType.getBaseType());

            } catch (ParseException ex) {
                throw new RuntimeException(ex);
            }
        });
    }

    /**
     * Closes the specified stream, logging (rather than propagating) any
     * failure so that the error that triggered the clean-up is not masked.
     */
    private static void closeQuietly(InputStream stream) {
        try {
            stream.close();
        } catch (IOException ignored) {
            logger.debug("Failed to close content stream.", ignored);
        }
    }

    /**
     * Sets the language detector that is handed to the processing pipeline.
     *
     * @return this builder
     */
    public DocumentBuilder withLanguageDetector(LanguageDetector langDetector) {
        this.langDetector = langDetector;
        return this;
    }

    /**
     * Enables stopword filtering.
     *
     * @return this builder
     */
    public DocumentBuilder withStopwords() {
        this.isStoppingEnabled = true;
        return this;
    }

    /**
     * Enables stemming.
     *
     * @return this builder
     */
    public DocumentBuilder withStemming() {
        this.isStemmingEnabled = true;
        return this;
    }

    /**
     * Enables matching of equal occurrences with different casing.
     *
     * @return this builder
     */
    public DocumentBuilder ignoreCase() {
        this.ignoreCase = true;
        return this;
    }

    /**
     * Loads the document specified in the factory method and processes it
     * into a {@link Document}.
     * <p>
     * This method obtains the document input from the lazy supplier, checks
     * that a compatible reader is available for its content type, borrows a
     * parser from the specified pool, runs a {@link DocumentPipeline} on the
     * calling thread and finally places the parser back in the pool. The
     * parser is placed back even when processing fails.
     *
     * @param occurrencesDB the database handed to the pipeline to hold the
     *                      created occurrences
     * @param parserPool    the pool from which a parser is borrowed for the
     *                      duration of the processing
     * @return the processed document, or {@code null} if the content type is
     * not supported, if processing failed or produced no document, or if the
     * calling thread was interrupted while taking or placing the parser (in
     * which case the interrupt status of the thread is restored)
     * @throws RuntimeException if the lazy supplier fails to load the input
     */
    public Document build(DB occurrencesDB, ParserPool parserPool) {
        Stopwatch sw = Stopwatch.createStarted();

        // step 1) Perform a lazy loading of the document, by obtaining its url,
        // content stream and content type.
        DocumentInput input = documentLazySupplier.get();

        // step 2) Checks if the input document is supported by the server
        boolean isSupported = OSGiManager.getCompatibleReader(input.getContentType()) != null;
        if (!isSupported) {
            // NOTE(review): the content stream opened by the supplier is not
            // released on this path; DocumentInput's API is not visible from
            // this class -- confirm whether it must be closed here.
            logger.info("Ignored processing document '{}': No compatible readers available for content-type '{}'.",
                    input.getUrl(), input.getContentType());
            return null;
        }

        // step 3) Takes a parser from the parser-pool.
        Parser parser;
        try {
            parser = parserPool.take();
        } catch (InterruptedException ex) {
            // restore the interrupt status so that callers can observe it
            Thread.currentThread().interrupt();
            logger.error(ex.getMessage(), ex);
            return null;
        }

        Document document;
        boolean isParserPlaced;
        try {
            // step 4) Build a processing instruction to be executed.
            //         A pipeline instantiates a new object for each of the
            //         required modules, improving performance of parallel jobs.
            DocumentPipeline pipeline = new DocumentPipeline(

                    // the language detection model
                    langDetector,

                    // general structure that holds the created occurrences
                    occurrencesDB,

                    // the input document info, including its path and InputStream
                    input,

                    // parser that will be used for document parsing and occurrence
                    // detection
                    parser,

                    // flag that sets that stopwords will be filtered during
                    // tokenization
                    isStoppingEnabled,

                    // flag that sets that every found occurrence during
                    // tokenization will be stemmed
                    isStemmingEnabled,

                    // flag that forces every found token to be lower case, matching,
                    // for example, the words 'be' and 'Be' as the same token
                    ignoreCase);

            // step 5) Process the document on the calling thread.
            document = process(pipeline);

        } finally {
            // step 6) Place the parser back in the parser-pool, regardless of
            //         the outcome of the processing, so that a failed document
            //         does not permanently shrink the pool.
            isParserPlaced = placeParser(parserPool, parser);
        }

        if (document == null || !isParserPlaced) {
            return null;
        }

        sw.stop();
        logger.info("Completed processing document '{}' in {}.", document.getUrl(), sw.toString());

        return document;
    }

    /**
     * Runs the specified pipeline, logging any failure.
     *
     * @return the result of the pipeline, or {@code null} if it failed
     */
    private static Document process(DocumentPipeline pipeline) {
        try {
            return pipeline.call();
        } catch (Exception ex) {
            logger.error(ex.getMessage(), ex);
            return null;
        }
    }

    /**
     * Places the specified parser back in the specified pool, restoring the
     * interrupt status of the thread if it is interrupted while doing so.
     *
     * @return {@code true} if the parser was placed back in the pool
     */
    private static boolean placeParser(ParserPool parserPool, Parser parser) {
        try {
            parserPool.place(parser);
            return true;
        } catch (InterruptedException ex) {
            Thread.currentThread().interrupt();
            logger.error(ex.getMessage(), ex);
            return false;
        }
    }
}