indexer.LuceneIndexerAddDocument.java Source code

Introduction

Here is the source code for indexer.LuceneIndexerAddDocument.java. The class indexes a single file into a Lucene index, using Apache Tika parsers to extract the text content of PDF, Microsoft Office, OpenDocument, EPUB, XML, HTML, RTF and plain-text documents; any other file type falls back to a plain UTF-8 read of the stream.
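
The indexDoc method below is package-private and expects an already open IndexWriter together with the file attributes supplied by a file-tree walk. The following caller is a minimal sketch, not part of the original project: the index and document directories are placeholder paths, and it assumes the com.Global object can simply be constructed and exposes the WRITE_LIMIT field used by the method.

package indexer;

import com.Global;
import java.io.IOException;
import java.nio.file.FileVisitResult;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.nio.file.SimpleFileVisitor;
import java.nio.file.attribute.BasicFileAttributes;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.FSDirectory;

public class IndexAllFiles {

    public static void main(String[] args) throws IOException {
        Path indexPath = Paths.get("C:/lucene/index"); // placeholder index location
        Path docsPath = Paths.get("C:/documents");     // placeholder document tree
        Global global = new Global();                  // assumed to expose WRITE_LIMIT

        IndexWriterConfig iwc = new IndexWriterConfig(new StandardAnalyzer());
        iwc.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND);

        try (IndexWriter writer = new IndexWriter(FSDirectory.open(indexPath), iwc)) {
            Files.walkFileTree(docsPath, new SimpleFileVisitor<Path>() {
                @Override
                public FileVisitResult visitFile(Path file, BasicFileAttributes attrs)
                        throws IOException {
                    //Hand every regular file to the indexer.
                    LuceneIndexerAddDocument.indexDoc(writer, file, attrs, global);
                    return FileVisitResult.CONTINUE;
                }
            });
        }
    }
}

With OpenMode.CREATE_OR_APPEND the writer takes the updateDocument branch of indexDoc, so re-running the indexer replaces any previously indexed copy of a file instead of adding a duplicate.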

Source

/*
 * To change this license header, choose License Headers in Project Properties.
 * To change this template file, choose Tools | Templates
 * and open the template in the editor.
 */
package indexer;

import com.Global;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.attribute.BasicFileAttributes;
import org.apache.log4j.Logger;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.document.LongField;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.IndexOptions;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.Term;
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.epub.EpubParser;
import org.apache.tika.parser.html.HtmlParser;
import org.apache.tika.parser.microsoft.OfficeParser;
import org.apache.tika.parser.microsoft.ooxml.OOXMLParser;
import org.apache.tika.parser.odf.OpenDocumentParser;
import org.apache.tika.parser.pdf.PDFParser;
import org.apache.tika.parser.rtf.RTFParser;
import org.apache.tika.parser.txt.TXTParser;
import org.apache.tika.parser.xml.XMLParser;
import org.apache.tika.sax.BodyContentHandler;
import org.xml.sax.SAXException;

/**
 *
 * @author Andrew
 */
public class LuceneIndexerAddDocument {

    final static Logger log = Logger.getLogger(LuceneIndexerAddDocument.class);

    /**
     * Indexes a single document with the aid of Apache Tika.
     *
     * @param writer Writer to the index where the file's information will be
     * stored.
     * @param file The file to index.
     * @param attrs The file's attributes, gathered while walking the file tree.
     * @param global Reference to the global configuration variables and
     * methods.
     * @throws IOException if the file cannot be read or added to the index.
     */
    static void indexDoc(IndexWriter writer, Path file, BasicFileAttributes attrs, Global global)
            throws IOException {
        File document = file.toFile();
        //Renaming the file to its own name is a cheap way to check (mainly on
        //Windows) that no other process currently holds a lock on it.
        if (document.renameTo(document)) {
            try (InputStream stream = Files.newInputStream(file)) {

                //make a new, empty document
                Document doc = new Document();

                //Add the path of the file as a field named "path".
                Field pathField = new StringField("path", file.toString(), Field.Store.YES);
                doc.add(pathField);

                //Add the last modified date of the file as a field named "modified".
                doc.add(new LongField("modified", attrs.lastModifiedTime().toMillis(), Field.Store.YES));

                //Add the created date of the file as a field named "created".
                doc.add(new LongField("created", attrs.creationTime().toMillis(), Field.Store.YES));

                //Add the document File Name
                doc.add(new StringField("filename", file.getFileName().toString(), Field.Store.YES));

                //Add the contents of the file as a field named "vcontent".
                //Tika handler, metadata and parse context used to extract the text
                BodyContentHandler handler = new BodyContentHandler(global.WRITE_LIMIT);
                Metadata metadata = new Metadata();
                FileInputStream inputstream = new FileInputStream(file.toFile());
                ParseContext pcontext = new ParseContext();

                //New Field Type
                FieldType bodyType = new FieldType();
                bodyType.setStored(true);
                bodyType.setTokenized(true);
                // for Highlighter, FastVectorHighlighter
                bodyType.setStoreTermVectors(true);
                bodyType.setStoreTermVectorPositions(true);
                bodyType.setStoreTermVectorOffsets(true);
                // for PostingsHighlighter
                bodyType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);

                /**
                 * Determine the document type and pick the proper Tika parser
                 * for it. Once the document is parsed, its content is added to
                 * the index with the position/offset information needed for
                 * highlighting.
                 */
                try {
                    if (file.toString().endsWith(".pdf")) {
                        PDFParser pdfparser = new PDFParser();
                        pdfparser.parse(inputstream, handler, metadata, pcontext);
                        doc.add(new Field("vcontent", handler.toString(), bodyType));
                    } else if (file.toString().endsWith(".docx") || file.toString().endsWith(".pptx")
                            || file.toString().endsWith(".xlsx") || file.toString().endsWith(".docm")
                            || file.toString().endsWith(".pptm") || file.toString().endsWith(".xlsm")) {
                        OOXMLParser msofficeparser = new OOXMLParser();
                        msofficeparser.parse(inputstream, handler, metadata, pcontext);
                        doc.add(new Field("vcontent", handler.toString(), bodyType));
                    } else if (file.toString().endsWith(".doc") || file.toString().endsWith(".ppt")
                            || file.toString().endsWith(".xlx")) {
                        OfficeParser msofficeparser = new OfficeParser();
                        msofficeparser.parse(inputstream, handler, metadata, pcontext);
                        doc.add(new Field("vcontent", handler.toString(), bodyType));
                    } else if (file.toString().endsWith(".odt") || file.toString().endsWith(".odp")
                            || file.toString().endsWith(".ods")) {
                        OpenDocumentParser openofficeparser = new OpenDocumentParser();
                        openofficeparser.parse(inputstream, handler, metadata, pcontext);
                        doc.add(new Field("vcontent", handler.toString(), bodyType));
                    } else if (file.toString().endsWith(".epub")) {
                        EpubParser epubParser = new EpubParser();
                        epubParser.parse(inputstream, handler, metadata, pcontext);
                        doc.add(new Field("vcontent", handler.toString(), bodyType));
                    } else if (file.toString().endsWith(".xml")) {
                        XMLParser XMLparser = new XMLParser();
                        XMLparser.parse(inputstream, handler, metadata, pcontext);
                        doc.add(new Field("vcontent", handler.toString(), bodyType));
                    } else if (file.toString().endsWith(".htm") || file.toString().endsWith(".html")
                            || file.toString().endsWith(".mhtml")) {
                        HtmlParser HTMLparser = new HtmlParser();
                        HTMLparser.parse(inputstream, handler, metadata, pcontext);
                        doc.add(new Field("vcontent", handler.toString(), bodyType));
                    } else if (file.toString().endsWith(".rtf")) {
                        RTFParser RTFparser = new RTFParser();
                        RTFparser.parse(inputstream, handler, metadata, pcontext);
                        doc.add(new Field("vcontent", handler.toString(), bodyType));
                    } else if (file.toString().endsWith(".txt")) {
                        TXTParser TXTparser = new TXTParser();
                        TXTparser.parse(inputstream, handler, metadata, pcontext);
                        doc.add(new Field("vcontent", handler.toString(), bodyType));
                    } else {
                        BufferedReader buffedRead = new BufferedReader(
                                new InputStreamReader(stream, StandardCharsets.UTF_8));
                        doc.add(new TextField("vcontent", buffedRead));
                    }
                } catch (SAXException | TikaException ex) {
                    log.fatal("Document Parsing Exception", ex);
                } finally {
                    //The extra FileInputStream opened for Tika is not covered by
                    //the outer try-with-resources, so close it explicitly.
                    inputstream.close();
                }

                if (writer.getConfig().getOpenMode() == IndexWriterConfig.OpenMode.CREATE) {
                    // New index, so we just add the document (no old document can be there):
                    writer.addDocument(doc);
                    System.out.println("adding " + file);
                } else {
                    /**
                     * Existing index (an old copy of this document may have
                     * been indexed) so we use updateDocument instead to replace
                     * the old one matching the exact path, if present:
                     */
                    writer.updateDocument(new Term("path", file.toString()), doc);
                    System.out.println("updating " + file);
                }
            }
        } else {
            System.out.println("LOCKED: " + file);
        }
    }

}
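
Because the "vcontent" field is indexed with DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS and stores term vectors, matching passages can be highlighted at search time. The following search-and-highlight sketch is not part of the original project; it uses the PostingsHighlighter from the same Lucene 5.x line this class appears to target, and the index path and query string are placeholders.

package indexer;

import java.io.IOException;
import java.nio.file.Paths;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.queryparser.classic.ParseException;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.postingshighlight.PostingsHighlighter;
import org.apache.lucene.store.FSDirectory;

public class SearchWithHighlights {

    public static void main(String[] args) throws IOException, ParseException {
        //Placeholder index location; it must match the one used when indexing.
        try (DirectoryReader reader = DirectoryReader.open(
                FSDirectory.open(Paths.get("C:/lucene/index")))) {
            IndexSearcher searcher = new IndexSearcher(reader);
            Query query = new QueryParser("vcontent", new StandardAnalyzer()).parse("lucene");
            TopDocs topDocs = searcher.search(query, 10);

            //PostingsHighlighter reads the offsets stored in the postings
            //(the DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS option set on bodyType).
            PostingsHighlighter highlighter = new PostingsHighlighter();
            String[] fragments = highlighter.highlight("vcontent", query, searcher, topDocs);

            for (int i = 0; i < topDocs.scoreDocs.length; i++) {
                String path = searcher.doc(topDocs.scoreDocs[i].doc).get("path");
                System.out.println(path + " -> " + fragments[i]);
            }
        }
    }
}

The classic Highlighter and FastVectorHighlighter named in the comments are alternatives that can use the stored term vectors (with positions and offsets) instead of the postings offsets.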