com.nridge.connector.fs.con_fs.core.FileCrawler.java Source code

Java tutorial

Introduction

Here is the source code for com.nridge.connector.fs.con_fs.core.FileCrawler.java

Source

/*
 * NorthRidge Software, LLC - Copyright (c) 2015.
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */

package com.nridge.connector.fs.con_fs.core;

import com.nridge.connector.common.con_com.Connector;
import com.nridge.connector.common.con_com.crawl.CrawlFollow;
import com.nridge.connector.common.con_com.crawl.CrawlIgnore;
import com.nridge.connector.common.con_com.crawl.CrawlQueue;
import com.nridge.core.app.mgr.AppMgr;
import com.nridge.core.base.doc.Document;
import com.nridge.core.base.ds.DSCriteria;
import com.nridge.core.base.ds.DSException;
import com.nridge.core.base.field.Field;
import com.nridge.core.base.field.data.DataBag;
import com.nridge.core.base.field.data.DataField;
import com.nridge.core.base.io.xml.DocumentXML;
import com.nridge.core.base.std.NSException;
import com.nridge.ds.content.ds_content.Content;
import com.nridge.ds.content.ds_content.ContentExtractor;
import com.nridge.ds.solr.Solr;
import com.nridge.ds.solr.SolrDS;
import org.apache.commons.lang.time.StopWatch;
import org.apache.commons.lang3.StringUtils;
import org.slf4j.Logger;

import java.io.File;
import java.io.IOException;
import java.nio.file.FileVisitResult;
import java.nio.file.Path;
import java.nio.file.SimpleFileVisitor;
import java.nio.file.attribute.BasicFileAttributes;
import java.nio.file.attribute.FileTime;
import java.util.Date;
import java.util.concurrent.BlockingQueue;

/**
 * The FileCrawler uses the Visitor design pattern to traverse a
 * file system hierarchy.  Each readable file that is not excluded by
 * the crawl-ignore rules is converted into a {@link Document}, saved
 * as XML to the crawl queue folder and announced on the extract
 * queue.  Please note that not all of the overridden methods are
 * used - they are kept here in case there is need in the future for
 * overriding them.
 *
 * @see <a href="http://docs.oracle.com/javase/tutorial/essential/io/walk.html">Walking the File Tree</a>
 * @see <a href="http://www.concretepage.com/java/jdk7/traverse-directory-structure-using-files-walkfiletree-java-nio2">Traverse a Directory Structure Using Files.walkFileTree in Java NIO 2</a>
 * @see <a href="http://docs.oracle.com/javase/7/docs/api/java/util/concurrent/BlockingQueue.html">JavaDoc BlockingQueue</a>
 * @see <a href="http://www.ibm.com/developerworks/library/j-jtp05236/">Dealing with InterruptedException</a>
 */
public class FileCrawler extends SimpleFileVisitor<Path> {
    private final DataBag mBag;
    private final SolrDS mSolrDS;
    private final AppMgr mAppMgr;
    private final CrawlFollow mCrawlFollow;
    private final CrawlIgnore mCrawlIgnore;
    private final boolean mIsValidationOnly;
    private final boolean mIsCSVRowToDocument;
    private final BlockingQueue<String> mExtractQueue;
    private final CrawlQueue mCrawlQueue;
    private final String mIdValuePrefix;

    /**
     * Creates an instance of the class and initializes it with
     * the application manager reference.  In addition, this
     * constructor will create instances of CrawlFollow and
     * CrawlIgnore objects for use during the visit process.
     *
     * @param anAppMgr Application manager instance.
     * @param aCrawlQueue Crawl queue instance.
     * @param aSolrDS Solr data source instance.
     *
     * @throws IOException Identifies an I/O error condition.
     * @throws NSException Identifies an initialization failure.
     */
    public FileCrawler(final AppMgr anAppMgr, CrawlQueue aCrawlQueue, SolrDS aSolrDS)
            throws IOException, NSException {
        super();

        mSolrDS = aSolrDS;
        mAppMgr = anAppMgr;
        mCrawlQueue = aCrawlQueue;

        mBag = (DataBag) mAppMgr.getProperty(Connector.PROPERTY_SCHEMA_NAME);

        // An unset or empty prefix property means document ids carry no prefix.
        String propertyName = Constants.CFG_PROPERTY_PREFIX + ".extract.id_value_prefix";
        String propertyValue = mAppMgr.getString(propertyName);
        mIdValuePrefix = StringUtils.isNotEmpty(propertyValue) ? propertyValue : StringUtils.EMPTY;

        propertyName = Constants.CFG_PROPERTY_PREFIX + ".extract.csv_row_to_document";
        mIsCSVRowToDocument = mAppMgr.getBoolean(propertyName, false);

        propertyName = Constants.CFG_PROPERTY_PREFIX + ".extract.validation_only";
        mIsValidationOnly = mAppMgr.getBoolean(propertyName, false);

        mCrawlFollow = new CrawlFollow(mAppMgr);
        mCrawlFollow.setCfgPropertyPrefix(Constants.CFG_PROPERTY_PREFIX + ".extract");
        mCrawlFollow.load();

        mCrawlIgnore = new CrawlIgnore(mAppMgr);
        mCrawlIgnore.setCfgPropertyPrefix(Constants.CFG_PROPERTY_PREFIX + ".extract");
        mCrawlIgnore.load();

        // The queue is stored as an untyped application property; this class
        // only ever puts String queue items on it, so the cast is confined here.
        @SuppressWarnings("unchecked")
        BlockingQueue<String> extractQueue =
                (BlockingQueue<String>) mAppMgr.getProperty(Connector.QUEUE_EXTRACT_NAME);
        mExtractQueue = extractQueue;
    }

    /**
     * Invoked for a directory before entries in the directory are visited.
     * The directory is descended into only when it matches the crawl-follow
     * rules; the walk is terminated when the application is no longer alive.
     *
     * @param aDirectory Directory instance.
     * @param aFileAttributes File attribute instance.
     *
     * @return {@link FileVisitResult#CONTINUE} to descend,
     * {@link FileVisitResult#SKIP_SUBTREE} to skip the directory or
     * {@link FileVisitResult#TERMINATE} to stop the walk.
     *
     * @throws IOException Identifies an I/O error condition.
     */
    @Override
    public FileVisitResult preVisitDirectory(Path aDirectory, BasicFileAttributes aFileAttributes)
            throws IOException {
        Logger appLogger = mAppMgr.getLogger(this, "preVisitDirectory");

        if (!mAppMgr.isAlive()) {
            return FileVisitResult.TERMINATE;
        }

        String pathName = aDirectory.toAbsolutePath().toString();
        if (mCrawlFollow.isMatchedNormalized(pathName)) {
            appLogger.debug("Following Path: {}", pathName);
            return FileVisitResult.CONTINUE;
        }

        appLogger.debug("Skipping Path: {}", pathName);
        return FileVisitResult.SKIP_SUBTREE;
    }

    /**
     * Asks the Solr index whether a document with the given id is present.
     * The lookup is only attempted when publishing uploads are enabled and
     * a Solr request URI is configured; otherwise (or when the lookup
     * fails) the document is reported as not existing.
     *
     * @param aDocId Document id (including any configured prefix).
     *
     * @return <i>true</i> if the index reports at least one match.
     */
    private boolean documentExistsInIndex(String aDocId) {
        Logger appLogger = mAppMgr.getLogger(this, "documentExistsInIndex");

        appLogger.trace(mAppMgr.LOGMSG_TRACE_ENTER);

        boolean docExists = false;
        String propertyName = Constants.CFG_PROPERTY_PREFIX + ".publish.upload_enabled";
        if (mAppMgr.getBoolean(propertyName)) {
            propertyName = Constants.CFG_PROPERTY_PREFIX + ".solr.request_uri";
            String solrURI = mAppMgr.getString(propertyName);
            if (StringUtils.isNotEmpty(solrURI)) {
                propertyName = Constants.CFG_PROPERTY_PREFIX + ".solr.request_handler";
                String propertyValue = mAppMgr.getString(propertyName, Constants.SOLR_REQUEST_HANDLER_DEFAULT);
                String requestHandler = StringUtils.removeStart(propertyValue, "/");
                // NOTE(review): aDocId is inserted without URL encoding; a configured
                // id prefix containing reserved characters would corrupt the query.
                String solrURL = String.format("%s/%s?q=nsd_id%%3A%s&fl=nsd_doc_hash&wt=xml&echoParams=none",
                        solrURI, requestHandler, aDocId);
                DSCriteria dsCriteria = new DSCriteria("Solr Document Exists");
                dsCriteria.add(Solr.FIELD_URL_NAME, Field.Operator.EQUAL, solrURL);
                try {
                    int docCount = mSolrDS.count(dsCriteria);
                    docExists = docCount > 0;
                    appLogger.debug("[{}] {}", docCount, solrURL);
                } catch (DSException e) {
                    // Best effort: a failed lookup is treated as "not indexed"
                    // so the file is (re)processed by the caller.
                    appLogger.error("{}: {}", solrURL, e.getMessage());
                }
            }
        }

        appLogger.trace(mAppMgr.LOGMSG_TRACE_DEPART);

        return docExists;
    }

    /**
     * Builds the URL under which the document can be viewed, using the
     * configured restlet host name (first entry when multi-valued) and
     * port number, falling back to the defaults in {@link Constants}.
     *
     * @param aDocId Document id.
     *
     * @return View URL string.
     */
    private String createViewURL(String aDocId) {
        Logger appLogger = mAppMgr.getLogger(this, "createViewURL");

        appLogger.trace(mAppMgr.LOGMSG_TRACE_ENTER);

        int portNumber = mAppMgr.getInt(Constants.CFG_PROPERTY_PREFIX + ".restlet.port_number",
                Constants.APPLICATION_PORT_NUMBER_DEFAULT);
        String propertyName = Constants.CFG_PROPERTY_PREFIX + ".restlet.host_names";
        String hostName = mAppMgr.getString(propertyName);
        if (StringUtils.isEmpty(hostName)) {
            hostName = Constants.HOST_NAME_DEFAULT;
        } else if (mAppMgr.isPropertyMultiValue(propertyName)) {
            String[] hostNameList = mAppMgr.getStringArray(propertyName);
            if ((hostNameList != null) && (hostNameList.length > 0)) {
                hostName = hostNameList[0];
            }
        }

        // Plain concatenation keeps the port digits ASCII; String.format("%d")
        // would localize them under some default locales.
        String docViewURL = "http://" + hostName + ":" + portNumber + "/fs/view?id=" + aDocId;

        appLogger.trace(mAppMgr.LOGMSG_TRACE_DEPART);

        return docViewURL;
    }

    /**
     * Determines whether a file of the given MIME type should be expanded
     * into one document per CSV row.
     *
     * @param aMimeType Detected MIME type (may be <i>null</i>).
     *
     * @return <i>true</i> if row expansion is enabled and the type is CSV.
     */
    private boolean isExpandableCSVFile(String aMimeType) {
        // NOTE(review): the previous condition tested CONTENT_TYPE_TXT_CSV twice.
        // The redundant operand was dropped; if a second CSV MIME type was
        // intended, it needs to be added here.
        return mIsCSVRowToDocument && StringUtils.equals(aMimeType, Content.CONTENT_TYPE_TXT_CSV);
    }

    /**
     * Saves the document as XML in the extract queue folder and places a
     * queue item for it on the extract queue.  Nothing is written when the
     * crawler runs in validation-only mode or the document bag has no
     * primary key field.
     *
     * @param aDocument Document instance to save and queue.
     * @param aStopWatch Running stop watch; it is stopped here and its
     *                   elapsed time is recorded in the queue item.
     *
     * @throws IOException Identifies an I/O error condition.
     */
    private void saveAddQueueDocument(Document aDocument, StopWatch aStopWatch) throws IOException {
        Logger appLogger = mAppMgr.getLogger(this, "saveAddQueueDocument");

        appLogger.trace(mAppMgr.LOGMSG_TRACE_ENTER);

        if (!mIsValidationOnly) {
            DataField primaryKeyField = aDocument.getBag().getPrimaryKeyField();
            if (primaryKeyField == null) {
                appLogger.error("Primary key field is missing from bag - cannot add to queue.");
            } else {
                String docId = primaryKeyField.getValueAsString();

                String queueBagPathFileName = mCrawlQueue.docPathFileName(Connector.QUEUE_EXTRACT_NAME, docId);
                DocumentXML documentXML = new DocumentXML(aDocument);
                documentXML.save(queueBagPathFileName);

                aStopWatch.stop();
                String queueItem = Connector.queueItemIdPhaseTime(docId, Connector.PHASE_EXTRACT,
                        aStopWatch.getTime());
                try {
                    // If queue is full, this thread may block.
                    mExtractQueue.put(queueItem);
                } catch (InterruptedException e) {
                    appLogger.warn("Interrupted while queuing document: {}", docId);
                    // Restore the interrupted status so parent can handle (if it wants to).
                    Thread.currentThread().interrupt();
                }
            }
        }

        appLogger.trace(mAppMgr.LOGMSG_TRACE_DEPART);
    }

    /**
     * Expands a CSV file into one document per row and queues each of them.
     * The CSV document is always closed, even when a row fails to process.
     *
     * @param aPath Path of the CSV file.
     * @param aFileAttributes File attribute instance.
     * @param aViewURL View URL assigned to every row document.
     *
     * @throws IOException Identifies an I/O error condition.
     */
    private void processCSVFile(Path aPath, BasicFileAttributes aFileAttributes, String aViewURL)
            throws IOException {
        Logger appLogger = mAppMgr.getLogger(this, "processCSVFile");

        appLogger.trace(mAppMgr.LOGMSG_TRACE_ENTER);

        File fsFile = aPath.toFile();
        String pathFileName = aPath.toAbsolutePath().toString();

        appLogger.debug("Processing CSV File: {}", pathFileName);

        CSVDocument csvDocument = new CSVDocument(mAppMgr, mBag);
        csvDocument.open(pathFileName);
        try {
            // These values are identical for every row of the file.
            String fileURL = fsFile.toURI().toURL().toString();
            String fileName = fsFile.getName();
            long createdMillis = aFileAttributes.creationTime().toMillis();
            long modifiedMillis = aFileAttributes.lastModifiedTime().toMillis();

            int row = 1;
            for (DataBag csvBag = csvDocument.extractNext(); csvBag != null; csvBag = csvDocument.extractNext()) {
                StopWatch stopWatch = new StopWatch();
                stopWatch.start();

                String docId = csvBag.generateUniqueHash(true);
                appLogger.debug(" Expanding Row [{}]: {}", row, docId);
                row++;

                csvBag.setValueByName("nsd_id", mIdValuePrefix + docId);
                csvBag.setValueByName("nsd_url", fileURL);
                csvBag.setValueByName("nsd_url_view", aViewURL);
                csvBag.setValueByName("nsd_url_display", aViewURL);
                csvBag.setValueByName("nsd_file_name", fileName);
                csvBag.setValueByName("nsd_mime_type", Content.CONTENT_TYPE_TXT_CSV);
                // A fresh Date per row: Date is mutable and the bag may keep the reference.
                csvBag.setValueByName("nsd_doc_created_ts", new Date(createdMillis));
                csvBag.setValueByName("nsd_doc_modified_ts", new Date(modifiedMillis));
                csvBag.setValueByName("nsd_crawl_type", mCrawlQueue.getCrawlType());
                Document fsDocument = new Document(Constants.FS_DOCUMENT_TYPE, csvBag);
                // The document hash is computed last so it covers the fields assigned above.
                csvBag.setValueByName("nsd_doc_hash", fsDocument.generateUniqueHash(false));

                saveAddQueueDocument(fsDocument, stopWatch);
            }
        } finally {
            csvDocument.close();
        }

        appLogger.trace(mAppMgr.LOGMSG_TRACE_DEPART);
    }

    /**
     * Derives the document id for a file from the configured id prefix
     * and a hash of its absolute path name.
     *
     * @param aPath Path instance.
     *
     * @return Document id.
     */
    private String generateDocumentId(Path aPath) {
        return mIdValuePrefix + Content.hashId(aPath.toAbsolutePath().toString());
    }

    /**
     * Builds a document describing the file (id, URLs, name, size,
     * timestamps, crawl type and - when the schema has a content field -
     * MIME type and extracted content) and queues it.  Expandable CSV
     * files are delegated to {@link #processCSVFile} instead and produce
     * no document for the file itself.
     *
     * @param aPath Path instance.
     * @param aFileAttributes File attribute instance.
     *
     * @throws IOException Identifies an I/O error condition.
     */
    private void processFile(Path aPath, BasicFileAttributes aFileAttributes) throws IOException {
        Logger appLogger = mAppMgr.getLogger(this, "processFile");

        appLogger.trace(mAppMgr.LOGMSG_TRACE_ENTER);

        StopWatch stopWatch = new StopWatch();
        stopWatch.start();

        File fsFile = aPath.toFile();
        String docId = generateDocumentId(aPath);
        String pathFileName = aPath.toAbsolutePath().toString();
        appLogger.debug("Processing File ({}): {}", docId, pathFileName);

        boolean isFileFlat = true;
        Document fsDocument = new Document(Constants.FS_DOCUMENT_TYPE, mBag);
        DataBag fileBag = fsDocument.getBag();
        fileBag.resetValuesWithDefaults();
        fileBag.setValueByName("nsd_id", docId);
        String fileName = fsFile.getName();
        fileBag.setValueByName("nsd_url", fsFile.toURI().toURL().toString());
        String viewURL = createViewURL(docId);
        fileBag.setValueByName("nsd_url_view", viewURL);
        fileBag.setValueByName("nsd_url_display", viewURL);
        fileBag.setValueByName("nsd_name", fileName);
        fileBag.setValueByName("nsd_file_name", fileName);
        fileBag.setValueByName("nsd_file_size", aFileAttributes.size());
        fileBag.setValueByName("nsd_doc_created_ts", new Date(aFileAttributes.creationTime().toMillis()));
        fileBag.setValueByName("nsd_doc_modified_ts", new Date(aFileAttributes.lastModifiedTime().toMillis()));
        fileBag.setValueByName("nsd_crawl_type", mCrawlQueue.getCrawlType());

        DataField contentField = fileBag.getFirstFieldByFeatureName(Field.FEATURE_IS_CONTENT);
        if (contentField != null) {
            ContentExtractor contentExtractor = new ContentExtractor(mAppMgr);
            contentExtractor.setCfgPropertyPrefix(Constants.CFG_PROPERTY_PREFIX + ".extract");
            try {
                String mimeType = contentExtractor.detectType(fsFile);
                if (StringUtils.isNotEmpty(mimeType)) {
                    fileBag.setValueByName("nsd_mime_type", mimeType);
                }
                if (isExpandableCSVFile(mimeType)) {
                    isFileFlat = false;
                    processCSVFile(aPath, aFileAttributes, viewURL);
                } else {
                    contentExtractor.process(pathFileName, contentField);
                }
            } catch (NSException e) {
                // Best effort: the file's metadata is still queued below even
                // when type detection or content extraction fails.
                appLogger.error("{}: {}", pathFileName, e.getMessage());
            }
        }

        if (isFileFlat) {
            fileBag.setValueByName("nsd_doc_hash", fsDocument.generateUniqueHash(false));
            saveAddQueueDocument(fsDocument, stopWatch);
        }

        appLogger.trace(mAppMgr.LOGMSG_TRACE_DEPART);
    }

    /**
     * Decides whether a readable, non-ignored file needs to be processed
     * for the current crawl.  Every file qualifies unless the crawl is
     * incremental, the document already exists in the index and the file
     * has not been modified after the crawl's last-modified date.
     *
     * @param aPath Path instance.
     * @param aFileAttributes File attribute instance.
     *
     * @return <i>true</i> if the file should be processed.
     */
    private boolean shouldProcessFile(Path aPath, BasicFileAttributes aFileAttributes) {
        if (!StringUtils.equals(mCrawlQueue.getCrawlType(), Connector.CRAWL_TYPE_INCREMENTAL)) {
            return true;
        }
        if (!documentExistsInIndex(generateDocumentId(aPath))) {
            return true;
        }

        Date incDate = mCrawlQueue.getCrawlLastModified();
        if (incDate == null) {
            // Without a reference date the file cannot be proven unchanged.
            return true;
        }
        Date lmDate = new Date(aFileAttributes.lastModifiedTime().toMillis());
        return lmDate.after(incDate);
    }

    /**
     * Invoked for a file in a directory.  Files matching the crawl-ignore
     * rules are skipped, unreadable files are logged as warnings and all
     * others are processed when the current crawl requires it.
     *
     * @param aPath Path instance.
     * @param aFileAttributes File attribute instance.
     *
     * @return {@link FileVisitResult#CONTINUE} while the application is
     * alive, otherwise {@link FileVisitResult#TERMINATE}.
     *
     * @throws IOException Identifies an I/O error condition.
     */
    @Override
    public FileVisitResult visitFile(Path aPath, BasicFileAttributes aFileAttributes) throws IOException {
        Logger appLogger = mAppMgr.getLogger(this, "visitFile");

        String pathFileName = aPath.toAbsolutePath().toString();
        if (mCrawlIgnore.isMatchedNormalized(pathFileName)) {
            appLogger.debug("Ignoring File: {}", pathFileName);
        } else if ((aPath.toFile().canRead()) && (mBag != null)) {
            if (shouldProcessFile(aPath, aFileAttributes)) {
                processFile(aPath, aFileAttributes);
            }
        } else {
            appLogger.warn("Access Failed: {}", pathFileName);
        }

        return mAppMgr.isAlive() ? FileVisitResult.CONTINUE : FileVisitResult.TERMINATE;
    }

    /**
     * Invoked for a file that could not be visited.  The failure is
     * logged as a warning and the walk continues with the next entry
     * instead of re-throwing the exception.
     *
     * @param aPathFile Path file instance.
     * @param anException Identifies an I/O error condition.
     *
     * @return Always {@link FileVisitResult#CONTINUE}.
     *
     * @throws IOException Declared by the overridden method.
     */
    @Override
    public FileVisitResult visitFileFailed(Path aPathFile, IOException anException) throws IOException {
        Logger appLogger = mAppMgr.getLogger(this, "visitFileFailed");

        appLogger.warn("{}: {}", aPathFile.toAbsolutePath().toString(), anException.getMessage());

        return FileVisitResult.CONTINUE;
    }

    /**
     * Invoked for a directory after entries in the directory, and all of their
     * descendants, have been visited.  This implementation simply delegates
     * to the super class.
     *
     * @param aDirectory Directory instance.
     * @param anException Identifies an I/O error condition.
     *
     * @return Result of the super class implementation.
     *
     * @throws IOException Identifies an I/O error condition.
     */
    @Override
    public FileVisitResult postVisitDirectory(Path aDirectory, IOException anException) throws IOException {
        return super.postVisitDirectory(aDirectory, anException);
    }
}