de.maklerpoint.office.Lucene.Indexer.java Source code

Java tutorial

Introduction

Here is the source code for de.maklerpoint.office.Lucene.Indexer.java

Source

/*
 * g5c Project
 *
 * Copyright (C) 2010 g5c. All Rights Reserved.
 * http://www.g5c.org
 *
 * This file is part of org.g5c
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program. See File LICENSE. If not, see
 * <http://www.gnu.org/licenses/>.
 */
package de.maklerpoint.office.Lucene;

import de.maklerpoint.office.Database.DatabaseConnection;
import de.maklerpoint.office.Filesystem.Filesystem;
import de.maklerpoint.office.Konstanten.FileTypes;
import de.maklerpoint.office.Logging.Log;
import de.maklerpoint.office.System.Configuration.DatabaseConfig;
import de.maklerpoint.office.Tools.FormatFileSize;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileReader;
import java.io.IOException;
import java.sql.Connection;
import java.sql.ResultSet;
import java.sql.ResultSetMetaData;
import java.sql.SQLException;
import java.sql.Statement;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.LimitTokenCountAnalyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;
import org.apache.pdfbox.cos.COSDocument;
import org.apache.pdfbox.pdfparser.PDFParser;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.util.PDFTextStripper;
import org.apache.poi.hslf.extractor.PowerPointExtractor;
import org.apache.poi.hssf.extractor.ExcelExtractor;
import org.apache.poi.hwpf.extractor.WordExtractor;
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
import org.openide.util.Exceptions;

/**
 *
 * @author Yves Hoppe <hoppe at maklerpoint.de>
 */
public class Indexer extends Thread {

    private IndexWriter writer;
    private IndexReader reader; // TODO Implement updater, atm n
    private ArrayList<File> queue = new ArrayList<File>();
    private String indexDir = Filesystem.getRootPath() + File.separator + "sonstiges" + File.separator + "index";
    private SimpleDateFormat df = new SimpleDateFormat("dd.MM.yyyy HH:mm");

    public Indexer() {
        super();
    }

    @Override
    public void run() {
        try {
            Analyzer an = new StandardAnalyzer(Version.LUCENE_32);
            FSDirectory dir = FSDirectory.open(new File(indexDir));

            IndexWriterConfig conf = new IndexWriterConfig(Version.LUCENE_32, an);
            LimitTokenCountAnalyzer lt = new LimitTokenCountAnalyzer(an, Integer.MAX_VALUE);
            conf.setOpenMode(OpenMode.CREATE); // Immer neuer INdex

            writer = new IndexWriter(dir, conf);

            int originalNumDocs = writer.numDocs();
            Log.logger.info("Anzahl indexierte Dateien: " + originalNumDocs);

            //            indexFileorDir(Filesystem.getRootPath());
            indexDatabase();

            int newNumDocs = writer.numDocs();
            Log.logger.info((newNumDocs - originalNumDocs) + " neue Dokumente indexiert.");

            writer.close();
        } catch (Exception ex) {
            //            Exceptions.printStackTrace(ex);
        } finally {
            if (writer != null) {
                try {
                    writer.close();
                } catch (CorruptIndexException ex) {
                    Exceptions.printStackTrace(ex);
                } catch (IOException ex) {
                    Exceptions.printStackTrace(ex);
                }
            }
        }
    }

    private void indexFileorDir(String fileName) throws IOException {
        listFiles(new File(fileName));

        for (File f : queue) {
            FileReader fr = null;
            try {
                if (f.getName().startsWith(".")) {
                    //                    System.out.println("Versteckte datei: " + f.getName());
                    // TODO add html, xml parsers
                } else if (f.getName().endsWith(".htm") || f.getName().endsWith(".html")
                        || f.getName().endsWith(".xml") || f.getName().endsWith(".txt")) {
                    Document doc = new Document();

                    //===================================================
                    // add contents of file
                    //===================================================
                    fr = new FileReader(f);
                    doc.add(new Field("contents", fr));

                    //===================================================
                    //adding second field which contains the path of the file
                    //===================================================
                    doc.add(new Field("path", f.getPath(), Field.Store.YES, Field.Index.ANALYZED));
                    /**
                     * Adding Typ
                     */
                    doc.add(new Field("type", String.valueOf(FileTypes.TXT), Field.Store.YES,
                            Field.Index.NOT_ANALYZED));

                    doc.add(new Field("modified", df.format(f.lastModified()), Field.Store.YES,
                            Field.Index.NOT_ANALYZED));

                    doc.add(new Field("filesize",
                            String.valueOf(FormatFileSize.formatSize(f.length(), FormatFileSize.KB)),
                            Field.Store.YES, Field.Index.NOT_ANALYZED));

                    writer.addDocument(doc);
                } else if (f.getName().endsWith(".pdf")) {
                    PDFParser parser = new PDFParser(new FileInputStream(f));
                    parser.parse();
                    COSDocument cd = parser.getDocument();
                    PDFTextStripper stripper = new PDFTextStripper();

                    String text = stripper.getText(new PDDocument(cd));

                    Document doc = new Document();

                    doc.add(new Field("contents", text, Field.Store.YES, Field.Index.ANALYZED));
                    doc.add(new Field("path", f.getPath(), Field.Store.YES, Field.Index.ANALYZED));
                    doc.add(new Field("type", String.valueOf(FileTypes.PDF), Field.Store.YES,
                            Field.Index.NOT_ANALYZED));

                    doc.add(new Field("modified", df.format(f.lastModified()), Field.Store.YES,
                            Field.Index.NOT_ANALYZED));

                    doc.add(new Field("filesize",
                            String.valueOf(FormatFileSize.formatSize(f.length(), FormatFileSize.KB)),
                            Field.Store.YES, Field.Index.NOT_ANALYZED));

                    writer.addDocument(doc);
                    cd.close();
                } else if (f.getName().endsWith(".doc") || f.getName().endsWith(".docx")) {

                    POIFSFileSystem fs = new POIFSFileSystem(new FileInputStream(f));
                    WordExtractor extractor = new WordExtractor(fs);
                    String wordText = extractor.getText();

                    Document doc = new Document();
                    doc.add(new Field("contents", wordText, Field.Store.YES, Field.Index.ANALYZED));

                    doc.add(new Field("path", f.getPath(), Field.Store.YES, Field.Index.ANALYZED));
                    doc.add(new Field("type", String.valueOf(FileTypes.DOC), Field.Store.YES,
                            Field.Index.NOT_ANALYZED));

                    doc.add(new Field("modified", df.format(f.lastModified()), Field.Store.YES,
                            Field.Index.NOT_ANALYZED));

                    doc.add(new Field("filesize",
                            String.valueOf(FormatFileSize.formatSize(f.length(), FormatFileSize.KB)),
                            Field.Store.YES, Field.Index.NOT_ANALYZED));

                    writer.addDocument(doc);
                } else if (f.getName().endsWith(".xls") || f.getName().endsWith(".xlsx")) {
                    POIFSFileSystem fs = new POIFSFileSystem(new FileInputStream(f));
                    ExcelExtractor extractor = new ExcelExtractor(fs);
                    String excelText = extractor.getText();

                    Document doc = new Document();
                    doc.add(new Field("contents", excelText, Field.Store.YES, Field.Index.ANALYZED));

                    doc.add(new Field("path", f.getPath(), Field.Store.YES, Field.Index.ANALYZED));
                    doc.add(new Field("type", String.valueOf(FileTypes.XLS), Field.Store.YES,
                            Field.Index.NOT_ANALYZED));

                    doc.add(new Field("modified", df.format(f.lastModified()), Field.Store.YES,
                            Field.Index.NOT_ANALYZED));

                    doc.add(new Field("filesize",
                            String.valueOf(FormatFileSize.formatSize(f.length(), FormatFileSize.KB)),
                            Field.Store.YES, Field.Index.NOT_ANALYZED));

                    writer.addDocument(doc);
                } else if (f.getName().endsWith(".ppt") || f.getName().endsWith(".pptx")) {
                    POIFSFileSystem fs = new POIFSFileSystem(new FileInputStream(f));
                    PowerPointExtractor extractor = new PowerPointExtractor(fs);
                    String ppttext = extractor.getText();

                    Document doc = new Document();
                    doc.add(new Field("contents", ppttext, Field.Store.YES, Field.Index.ANALYZED));

                    doc.add(new Field("path", f.getPath(), Field.Store.YES, Field.Index.ANALYZED));

                    doc.add(new Field("modified", df.format(f.lastModified()), Field.Store.YES,
                            Field.Index.NOT_ANALYZED));

                    doc.add(new Field("type", String.valueOf(FileTypes.PPT), Field.Store.YES,
                            Field.Index.NOT_ANALYZED));

                    doc.add(new Field("filesize",
                            String.valueOf(FormatFileSize.formatSize(f.length(), FormatFileSize.KB)),
                            Field.Store.YES, Field.Index.NOT_ANALYZED));

                    writer.addDocument(doc);
                }

                if (Log.logger.isDebugEnabled()) {
                    Log.logger.debug("Lucene | Neue Datei indexiert: " + f);
                }
            } catch (Exception e) {
                if (Log.logger.isDebugEnabled()) {
                    Log.logger.debug("Datei konnte nicht indexiert werden: " + f, e);
                }
                continue;
            } finally {
                //                fr.close();
            }
        }

        writer.optimize();
        queue.clear();

    }

    private void indexDatabase() throws SQLException, CorruptIndexException, IOException {
        for (int i = 0; i < DatabaseConfig.TABLES_INDEX.length; i++) {
            Connection con = DatabaseConnection.open();
            String sql = "SELECT * FROM " + DatabaseConfig.TABLES_INDEX[i];
            Statement stmt = con.createStatement(ResultSet.TYPE_SCROLL_SENSITIVE, ResultSet.CONCUR_READ_ONLY);
            ResultSet rs = stmt.executeQuery(sql);
            ResultSetMetaData rsMetaData = rs.getMetaData();

            int columnCount = rsMetaData.getColumnCount();
            while (rs.next()) {
                int id = rs.getInt("id");
                String modified = df.format(rs.getTimestamp("modified"));

                for (int j = 1; j <= columnCount; j++) {
                    Document d = new Document();
                    d.add(new Field("dbid", String.valueOf(id), Field.Store.YES, Field.Index.NOT_ANALYZED));
                    d.add(new Field("path", rsMetaData.getColumnName(j), Field.Store.YES,
                            Field.Index.NOT_ANALYZED));

                    if (rs.getString(j) != null) {
                        d.add(new Field("contents", rs.getString(j), Field.Store.YES, Field.Index.ANALYZED));
                    }

                    d.add(new Field("type", String.valueOf(FileTypes.DATABASE_TABLE), Field.Store.YES,
                            Field.Index.NOT_ANALYZED));
                    d.add(new Field("table", DatabaseConfig.TABLES_INDEX[i], Field.Store.YES,
                            Field.Index.NOT_ANALYZED));
                    d.add(new Field("modified", modified, Field.Store.YES, Field.Index.NOT_ANALYZED));

                    //                    System.out.println("table: " + DatabaseConfig.TABLES_INDEX[i]);
                    //                    System.out.println("column: " + rsMetaData.getColumnName(j));
                    //                    System.out.println("content: " + rs.getString(j));
                    writer.addDocument(d);
                }
            }
            stmt.close();
            con.close();
        }

        writer.optimize();
    }

    private void listFiles(File file) {
        if (!file.exists()) {
            System.out.println(file + " does not exist.");
        }
        if (!file.getName().startsWith(".")) {
            if (file.isDirectory()) {
                for (File f : file.listFiles()) {
                    listFiles(f);
                }
            } else {
                queue.add(file);
            }
        }
    }

    /**
     * Close the index.
     * @throws java.io.IOException
     */
    public void closeIndex() throws IOException {
        writer.optimize();
        writer.close();
    }
}