com.shmsoft.dmass.main.FileProcessor.java Source code

Introduction

Here is the source code for com.shmsoft.dmass.main.FileProcessor.java. FileProcessor is the abstract base class for the Hadoop map step: it opens a file, extracts its metadata and text, runs the project's culling queries against an in-memory Lucene index, and emits responsive documents to the Hadoop context.

Source

/*
 *
 * Copyright SHMsoft, Inc. 
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
*/
package com.shmsoft.dmass.main;

import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.MD5Hash;
import org.apache.hadoop.io.MapWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper.Context;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.queryParser.ParseException;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.Searcher;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.util.Version;
import org.apache.tika.metadata.Metadata;
import com.shmsoft.dmass.data.index.LuceneIndex;
import com.shmsoft.dmass.data.index.SolrIndex;
import com.shmsoft.dmass.mail.EmailProperties;
import com.shmsoft.dmass.ocr.OCRProcessor;
import com.shmsoft.dmass.print.OfficePrint;
import com.shmsoft.dmass.services.Util;
import com.shmsoft.dmass.services.History;
import com.shmsoft.dmass.services.Project;
import com.shmsoft.dmass.services.Stats;

/**
 * Opens the file, creates a Lucene index and searches it, then updates the
 * Hadoop map context with responsive documents
 */
public abstract class FileProcessor {

    private String zipFileName;
    private String singleFileName;
    private Context context; // Hadoop processing result context
    protected int docCount;
    private LuceneIndex luceneIndex;

    public String getZipFileName() {
        return zipFileName;
    }

    public String getSingleFileName() {
        return singleFileName;
    }

    public Context getContext() {
        return context;
    }

    public LuceneIndex getLuceneIndex() {
        return luceneIndex;
    }

    /**
     * Zip files are the initial file format passed to the Hadoop map step
     *
     * @param zipFileName name of the zip file to process
     */
    public void setZipFileName(String zipFileName) {
        this.zipFileName = zipFileName;
    }

    // TODO method not used in project
    public void setSingleFileName(String singleFileName) {
        this.singleFileName = singleFileName;
    }

    /**
     * Constructor
     *
     * @param context Hadoop processing context
     * @param luceneIndex Lucene index to which processed documents are added
     */
    public FileProcessor(Context context, LuceneIndex luceneIndex) {
        this.context = context;
        this.luceneIndex = luceneIndex;
    }

    /**
     * Process the input, emitting responsive documents to the Hadoop context
     *
     * @throws IOException
     * @throws InterruptedException
     */
    abstract public void process() throws IOException, InterruptedException;

    /**
     * Cull, then emit responsive files
     *
     * @param tempFile Temporary uncompressed file on disk
     * @param originalFileName Original file name
     * @throws IOException
     * @throws InterruptedException
     */
    protected void processFileEntry(String tempFile, String originalFileName)
            throws IOException, InterruptedException {
        Project project = Project.getProject();
        if (project.checkSkip()) {
            return;
        }
        project.incrementCurrentMapCount();
        if (!project.isMapCountWithinRange()) {
            return;
        }
        // update application log
        History.appendToHistory("FileProcess.processFileEntry: " + originalFileName);
        // set to true if file matches any query params
        boolean isResponsive = false;
        // exception message to place in output if error occurs
        String exceptionMessage = null;
        // Document metadata, derived from Tika metadata class
        DocumentMetadata metadata = new DocumentMetadata();
        try {
            metadata.set(DocumentMetadataKeys.DOCUMENT_ORIGINAL_PATH,
                    getOriginalDocumentPath(tempFile, originalFileName));
            // extract file contents with Tika
            // Tika metadata class contains references to metadata and file text
            extractMetadata(tempFile, metadata, originalFileName);
            if (project.isRemoveSystemFiles() && Util.isSystemFile(metadata)) {
                // TODO should we log deNISTing (removal of known system files)?
                return;
            }
            metadata.set(DocumentMetadataKeys.CUSTODIAN, project.getCurrentCustodian());
            // search through Tika results using Lucene
            isResponsive = isResponsive(metadata);
        } catch (Exception e) {
            e.printStackTrace(System.out);
            History.appendToHistory("Exception: " + e.getMessage());
            exceptionMessage = e.getMessage();
        }
        // update exception message if error
        if (exceptionMessage != null) {
            metadata.set(DocumentMetadataKeys.PROCESSING_EXCEPTION, exceptionMessage);
        }
        if (isResponsive || exceptionMessage != null) {
            createImage(tempFile, metadata, originalFileName);
            emitAsMap(tempFile, metadata, originalFileName);
        }
        History.appendToHistory("Responsive: " + isResponsive);
    }

    private boolean isPdf() {
        return Project.getProject().isCreatePDF();
    }

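    /**
     * Create a PDF image of the document if PDF imaging is enabled in the project
     */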
    private void createImage(String fileName, Metadata metadata, String originalFileName) {
        if (isPdf()) {
            OfficePrint.getInstance().createPdf(fileName, fileName + ".pdf", originalFileName);
        }
    }

    /**
     * Add the search result (Tika metadata) to the Hadoop context as a map.
     * The key is the MD5 hash of the file used to create the map.
     *
     * @param fileName Name of the file the search was performed on
     * @param metadata Metadata extracted from the search
     * @param originalFileName Original file name, used to compute the key for e-mail files
     * @throws IOException
     * @throws InterruptedException
     */
    @SuppressWarnings("unchecked")
    private void emitAsMap(String fileName, Metadata metadata, String originalFileName)
            throws IOException, InterruptedException {
        MapWritable mapWritable = createMapWritable(metadata, fileName);
        //create the hash for this type of file
        MD5Hash key = createKeyHash(fileName, metadata, originalFileName);
        // emit map
        if (PlatformUtil.isNix()) {
            context.write(key, mapWritable);
            context.progress();
        } else {
            ArrayList<MapWritable> values = new ArrayList<>();
            values.add(mapWritable);
            WindowsReduce.getInstance().reduce(key, values, null);
        }
        // update stats
        Stats.getInstance().increaseItemCount();
    }

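    /**
     * Create the Hadoop key for a document. For e-mail (.eml) files the key is
     * the MD5 hash of the header fields listed in EmailProperties.EMAIL_HASH_NAMES;
     * for all other files it is the MD5 hash of the file contents.
     *
     * @param fileName name of the file on disk
     * @param metadata extracted metadata, used for e-mail hashing
     * @param originalFileName original file name, used to determine the extension
     * @return MD5 hash to use as the Hadoop key
     * @throws IOException
     */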
    public static MD5Hash createKeyHash(String fileName, Metadata metadata, String originalFileName)
            throws IOException {
        String extension = Util.getExtension(originalFileName);

        if ("eml".equalsIgnoreCase(extension)) {
            String hashNames = EmailProperties.getInstance().getProperty(EmailProperties.EMAIL_HASH_NAMES);
            String[] hashNamesArr = hashNames.split(",");

            StringBuilder data = new StringBuilder();

            for (String hashName : hashNamesArr) {
                String value = metadata.get(hashName);
                if (value != null) {
                    data.append(value);
                    data.append(" ");
                }
            }

            return MD5Hash.digest(data.toString());
        } else {
            //use MD5 of the input file as Hadoop key      
            FileInputStream fileInputStream = new FileInputStream(fileName);
            MD5Hash key = MD5Hash.digest(fileInputStream);
            fileInputStream.close();

            return key;
        }
    }

    /**
     * Create a Hadoop map of the document metadata and native content
     *
     * @param metadata Tika metadata to insert into the map
     * @param fileName File currently in process
     * @return Created map
     * @throws IOException
     */
    private MapWritable createMapWritable(Metadata metadata, String fileName) throws IOException {
        MapWritable mapWritable = new MapWritable();
        String[] names = metadata.names();
        for (String name : names) {
            mapWritable.put(new Text(name), new Text(metadata.get(name)));
        }
        byte[] bytes = new File(fileName).length() < ParameterProcessing.ONE_GIG ? Util.getFileContent(fileName)
                : "File too large".getBytes();
        mapWritable.put(new Text(ParameterProcessing.NATIVE), new BytesWritable(bytes));

        if (isPdf()) {
            String pdfFileName = fileName + ".pdf";
            if (new File(pdfFileName).exists()) {
                byte[] pdfBytes = Util.getFileContent(pdfFileName);
                mapWritable.put(new Text(ParameterProcessing.NATIVE_AS_PDF), new BytesWritable(pdfBytes));
            }
        }
        return mapWritable;
    }

    /**
     * Search metadata and file contents against the project's culling queries
     *
     * @param metadata document metadata, including the extracted text
     * @return true if a match is found, else false
     */
    private boolean isResponsive(Metadata metadata) {
        // set true if search finds a match
        boolean isResponsive = false;

        // get culling parameters
        String queryString = Project.getProject().getCullingAsTextBlock();

        // TODO pass important parameters to mappers and reducers individually, not globally
        IndexWriter writer = null;
        RAMDirectory idx = null;
        try {
            // construct a RAMDirectory to hold the in-memory representation of the index.
            idx = new RAMDirectory();

            // make a writer to create the index
            writer = new IndexWriter(idx, new StandardAnalyzer(Version.LUCENE_30), true,
                    IndexWriter.MaxFieldLength.UNLIMITED);

            writer.addDocument(createDocument(metadata));

            // optimize and close the writer to finish building the index
            writer.optimize();
            writer.close();

            //add the built index to the file-system (FS) Lucene index
            if (Project.getProject().isLuceneFSIndexEnabled() && luceneIndex != null) {
                luceneIndex.addToIndex(idx);
            }

            SolrIndex.getInstance().addBatchData(metadata);

            if (queryString == null || queryString.trim().isEmpty()) {
                return true;
            }

            // build an IndexSearcher using the in-memory index
            Searcher searcher = new IndexSearcher(idx);
            // search directory
            isResponsive = search(searcher, queryString);

            searcher.close();
        } catch (Exception e) {
            // TODO handle this better
            // if anything happens - don't stop processing
            e.printStackTrace(System.out);
        } finally {
            try {
                if (writer != null) {
                    writer.close();
                }
                if (idx != null) {
                    idx.close();
                }
            } catch (Exception e) {
                // swallow exception, what else can you do now?
            }
        }
        return isResponsive;
    }

    /**
     * Create an Apache Lucene document from the extracted metadata
     *
     * @param metadata document metadata, including the title and extracted text
     * @return Lucene document
     */
    private static Document createDocument(Metadata metadata) {
        // pull the title and the extracted text out of the metadata
        String title = metadata.get(ParameterProcessing.TITLE);
        // TODO - where is my title?
        if (title == null) {
            title = "";
        }

        String content = metadata.get(DocumentMetadataKeys.DOCUMENT_TEXT);

        Document doc = new Document();
        doc.add(new Field(ParameterProcessing.TITLE, title.toLowerCase(), Field.Store.YES, Field.Index.ANALYZED));
        if (content != null) {
            doc.add(new Field(ParameterProcessing.CONTENT, content.toLowerCase(), Field.Store.NO,
                    Field.Index.ANALYZED));
        }

        //add all metadata fields
        String[] metadataNames = metadata.names();
        for (String name : metadataNames) {
            String data = metadata.get(name);
            doc.add(new Field(name, data.toLowerCase(), Field.Store.YES, Field.Index.ANALYZED));
        }

        return doc;
    }

    /**
     * Search the in-memory index for the culling query
     *
     * @param searcher Lucene searcher over the in-memory index
     * @param queryString What to search for
     * @return true if matches are found, else false
     * @throws ParseException
     * @throws IOException
     */
    private static boolean search(Searcher searcher, String queryString) throws ParseException, IOException {
        // explode the search input string into an OR query
        String parsedQuery = parseQueryString(queryString);
        // Lucene query parser
        QueryParser queryParser = new QueryParser(Version.LUCENE_30, "content",
                new StandardAnalyzer(Version.LUCENE_30));
        if (parsedQuery.length() == 0) {
            return true;
        } else {
            // Build a Query object
            Query query = queryParser.parse(parsedQuery);
            // Search for the query
            TopDocs topDocs = searcher.search(query, 1);
            return topDocs.totalHits > 0;
        }
    }

    /**
     * Join the newline-separated culling queries with OR
     *
     * @param queryString newline-separated query lines
     * @return the lines joined into a single OR query, e.g. "alpha OR beta"
     */
    private static String parseQueryString(String queryString) {
        StringBuilder query = new StringBuilder();
        String[] strings = queryString.split("\n");
        for (int i = 0; i < strings.length; ++i) {
            String string = strings[i];
            query.append(string);
            if (i < strings.length - 1) {
                query.append(" OR ");
            }
        }
        return query.toString();
    }

    /**
     * Extracts document metadata. Text is part of it. Forensics information is
     * part of it. When OCR is enabled, text recognized in images is appended to
     * the document text.
     *
     * @param tempFile temporary uncompressed file on disk
     * @param metadata metadata object to fill
     * @param originalFileName original file name
     */
    private void extractMetadata(String tempFile, DocumentMetadata metadata, String originalFileName) {
        DocumentParser.getInstance().parse(tempFile, metadata, originalFileName);
        //System.out.println(Util.toString(metadata));

        //OCR processing
        if (Project.getProject().isOcrEnabled()) {
            OCRProcessor ocrProcessor = OCRProcessor.createProcessor(ParameterProcessing.OCR_OUTPUT, context);
            List<String> images = ocrProcessor.getImageText(tempFile);

            if (images != null && images.size() > 0) {
                StringBuilder allContent = new StringBuilder();

                String documentContent = metadata.get(DocumentMetadataKeys.DOCUMENT_TEXT);
                if (documentContent != null) {
                    allContent.append(documentContent);
                }

                for (String image : images) {
                    allContent.append(System.getProperty("line.separator")).append(image);
                }

                metadata.set(DocumentMetadataKeys.DOCUMENT_TEXT, allContent.toString());
            }
        }
    }

    /**
     * @return the original path of the document being processed
     */
    abstract String getOriginalDocumentPath(String tempFile, String originalFileName);
}
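
Example

FileProcessor is abstract, so the Hadoop map step works through concrete subclasses. The sketch below is hypothetical (the class SingleFileProcessor does not appear above); it only illustrates the contract: implement process() to walk the input and call processFileEntry() for each document, and implement getOriginalDocumentPath() to report where the document originally lived.

package com.shmsoft.dmass.main;

import java.io.IOException;

import org.apache.hadoop.mapreduce.Mapper.Context;

import com.shmsoft.dmass.data.index.LuceneIndex;

/**
 * Hypothetical subclass, shown only to illustrate the FileProcessor contract
 */
public class SingleFileProcessor extends FileProcessor {

    public SingleFileProcessor(Context context, LuceneIndex luceneIndex) {
        super(context, luceneIndex);
        // the file to process would be supplied via setSingleFileName(...)
    }

    @Override
    public void process() throws IOException, InterruptedException {
        // cull the single file and, if responsive, emit it to the Hadoop context
        processFileEntry(getSingleFileName(), getSingleFileName());
    }

    @Override
    String getOriginalDocumentPath(String tempFile, String originalFileName) {
        // for a stand-alone file, the original path is simply its name
        return originalFileName;
    }
}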