org.punksearch.crawler.IndexOperator.java Source code

Java tutorial

Introduction

Here is the source code for org.punksearch.crawler.IndexOperator.java

Source

/***************************************************************************
 *                                                                         *
 *   PunkSearch - Searching over LAN                                       *
 *                                                                         *
 *   This program is free software; you can redistribute it and/or modify  *
 *   it under the terms of the GNU General Public License as published by  *
 *   the Free Software Foundation; either version 2 of the License, or     *
 *   (at your option) any later version.                                   *
 *                                                                         *
 ***************************************************************************/
package org.punksearch.crawler;

import org.apache.commons.io.FileUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.KeywordAnalyzer;
import org.apache.lucene.analysis.PerFieldAnalyzerWrapper;
import org.apache.lucene.document.DateTools;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.*;
import org.apache.lucene.search.*;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.punksearch.common.IndexFields;
import org.punksearch.crawler.analysis.FilenameAnalyzer;
import org.punksearch.util.lucene.LuceneUtils;

import java.io.File;
import java.io.IOException;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;

/**
 * Utility class to work with an index. Hides all Lucene's read/write logics.
 * <p/>
 * It has both instance and static methods. The instance methods affect the index directory IndexOperator was
 * instantiated with. The static methods are for utility tasks -- they open and close an index themselves.
 *
 * @author Yury Soldak (ysoldak@gmail.com)
 */
public class IndexOperator {
    private static final Log log = LogFactory.getLog(IndexOperator.class);

    private IndexWriter indexWriter;
    private static Analyzer analyzer = createAnalyzer();

    public static final String WRITE_LOCK_FILE = "write.lock";

    /**
     * Creates an instance of the class and opens index directory.
     *
     * @param indexDirectory Index directory to operate under.
     */
    public IndexOperator(String indexDirectory) {
        try {
            this.indexWriter = createIndexWriter(indexDirectory);
        } catch (IOException e) {
            throw new IllegalArgumentException(e);
        }
    }

    /**
     * Adds documents to the current index.
     *
     * @param documentList List of Document. Null value is synonymous to an empty list.
     * @return Boolean result. False in the case of internal I/O exception.
     * @throws IllegalStateException In the case "close" method was called before.
     */
    public boolean addDocuments(List<Document> documentList) {
        if (isClosed()) {
            throw new IllegalStateException("Index was closed already.");
        }

        if (documentList == null || documentList.size() == 0) {
            return true;
        }

        try {
            for (Document document : documentList) {
                indexWriter.addDocument(document);
            }
            return true;
        } catch (IOException e) {
            log.warn("Failed adding documents into index. " + e.getMessage());
            return false;
        }

    }

    /**
     * Deletes all documents (what match IP and protocol) from the current index.
     *
     * @param ip    IP of a host to delete documents for.
     * @param proto Protocol to match for documents to be deleted. "smb" or "ftp".
     * @return Boolean result. False in the case of internal I/O exception.
     * @throws IllegalStateException In the case "close" method was called before.
     */
    public boolean deleteDocuments(String ip, String proto) {
        if (isClosed()) {
            throw new IllegalStateException("Index was closed already.");
        }
        try {
            indexWriter.deleteDocuments(new Term(IndexFields.HOST, proto + "_" + ip));
            return true;
        } catch (IOException e) {
            log.warn("Failed deleting documents for host '" + proto + "://" + ip + "'. " + e.getMessage());
            return false;
        }
    }

    /**
     * Optimizes the current index.
     *
     * @throws IllegalStateException In the case "close" method was called before.
     */
    public void optimize() {
        if (isClosed()) {
            throw new IllegalStateException("Index was closed already.");
        }
        try {
            indexWriter.optimize();
        } catch (CorruptIndexException e) {
            throw new RuntimeException(e);
        } catch (IOException e) {
            throw new RuntimeException(e);
        }
    }

    /* *
      * Ensures all data was merged into the current index on disk.
      *
      * @throws IllegalStateException
      *             In the case "close" method was called before.
      */
    /*public void flush() {
     if (isClosed()) {
         throw new IllegalStateException("Index was closed already.");
     }
     try {
         indexWriter.flush();
     } catch (CorruptIndexException e) {
         throw new RuntimeException(e);
     } catch (IOException e) {
         throw new RuntimeException(e);
     }
     }*/

    /**
     * Closes the index, unlocks it. It is not possible to use any instance methods except "isClosed" after this method
     * was called.
     *
     * @throws IllegalStateException In the case "close" method was called before.
     */
    public void close() {
        if (isClosed()) {
            throw new IllegalStateException("Index was closed already.");
        }
        try {
            indexWriter.close();
            indexWriter = null;
        } catch (CorruptIndexException e) {
            throw new RuntimeException(e);
        } catch (IOException e) {
            throw new RuntimeException(e);
        }
    }

    /**
     * Checks if index was closed. Use this method to check (if unsure) if you can call instance methods.
     *
     * @return True if index was closed (i.e. "close" method called before).
     */
    public boolean isClosed() {
        return indexWriter == null;
    }

    public static void deleteByHost(String dir, String host, String hostName) {
        try {
            IndexWriter iw = createIndexWriter(dir);

            final BooleanQuery delQuery = new BooleanQuery();
            delQuery.add(new TermQuery(new Term(IndexFields.HOST, host)), BooleanClause.Occur.SHOULD);
            delQuery.add(new TermQuery(new Term(IndexFields.HOST_NAME, hostName)), BooleanClause.Occur.SHOULD);

            iw.deleteDocuments(delQuery);
            iw.close();
        } catch (IOException ex) {
            log.error("Exception during merging index directories", ex);
            throw new RuntimeException(ex);
        }
    }

    public static void deleteAll(String dir) {
        try {
            File indexDir = new File(dir);
            if (indexDir.exists()) {
                FileUtils.cleanDirectory(indexDir);
                IndexWriter iw = createIndexWriter(dir);
                iw.close();
            }
        } catch (IOException e) {
            throw new RuntimeException(e);
        }
    }

    public static void deleteByAge(String dirPath, float days) {
        try {
            final Directory dir = LuceneUtils.dir(dirPath);
            boolean indexExists = IndexReader.indexExists(dir);
            if (!indexExists) {
                return;
            }

            final IndexWriter iw = createIndexWriter(dirPath);
            final IndexReader ir = IndexReader.open(dir);
            IndexSearcher is = new IndexSearcher(ir);

            long min = 0;
            long max = System.currentTimeMillis() - Math.round(days * 1000 * 3600 * 24);

            final TermRangeQuery oldDocsQuery = new TermRangeQuery(IndexFields.INDEXED,
                    DateTools.timeToString(min, DateTools.Resolution.MILLISECOND),
                    DateTools.timeToString(max, DateTools.Resolution.MILLISECOND), true, false);

            final int docsInReader = ir.numDocs();
            final TopDocs topDocs = is.search(oldDocsQuery, Math.max(1, docsInReader));
            log.info("Deleting by age from index directory. Items to delete: " + topDocs.totalHits);

            iw.deleteDocuments(oldDocsQuery);

            iw.close();
        } catch (IOException ex) {
            log.error("Exception during deleting by age from index directory", ex);
            throw new RuntimeException(ex);
        }
    }

    public static void merge(String targetDir, Set<String> sourceDirs) {
        try {
            IndexWriter iw = createIndexWriter(targetDir);
            Directory[] dirs = new Directory[sourceDirs.size()];
            int i = 0;
            for (String source : sourceDirs) {
                dirs[i] = LuceneUtils.dir(source);
                i++;
            }
            iw.addIndexes(dirs);
            iw.commit();
            iw.close();
        } catch (IOException ex) {
            log.error("Exception during merging index directories", ex);
            throw new RuntimeException(ex);
        }
    }

    private static IndexWriter createIndexWriter(String dir) throws IOException {
        return createIndexWriter(dir, -1);
    }

    /**
     * @param dir      to open writer for
     * @param segments less segments number - slower indexing but faster search and vice versa, 0 - means default
     * @return new index writer
     * @throws IOException
     */
    private static IndexWriter createIndexWriter(String dir, int segments) throws IOException {
        final IndexWriterConfig indexWriterConfig = new IndexWriterConfig(LuceneVersion.VERSION, analyzer);

        if (segments > 0) {
            indexWriterConfig.setMergePolicy(
                    new TieredMergePolicy().setSegmentsPerTier(segments).setMaxMergeAtOnce(segments));
        }

        return new IndexWriter(FSDirectory.open(new File(dir)), indexWriterConfig);
    }

    public static boolean isLocked(String dir) {
        try {
            return LuceneUtils.dir(dir).fileExists(WRITE_LOCK_FILE);
        } catch (IOException e) {
            log.warn("IOException during checking if index directory is locked, "
                    + "assuming it is not (maybe index directory just does not exist?)");
            return false;
        }
    }

    public static void unlock(String dir) {
        try {
            log.trace("Clearing lock: " + dir);

            final Directory d = LuceneUtils.dir(dir);
            d.clearLock(WRITE_LOCK_FILE);
            d.close();
        } catch (IOException e) {
            log.error("Error clearing lock", e);
        }
    }

    public static boolean indexExists(String dir) {
        try {
            final File dirFile = new File(dir);
            return dirFile.isDirectory() && IndexReader.indexExists(LuceneUtils.dir(dir));
        } catch (IOException e) {
            throw new RuntimeException(e);
        }
    }

    public static void createIndex(String dir) throws IOException {
        IndexWriter iw = createIndexWriter(dir);
        iw.close();
    }

    private static Analyzer createAnalyzer() {
        final FilenameAnalyzer filenameAnalyzer = new FilenameAnalyzer();

        Map<String, Analyzer> analyzerMap = new HashMap<String, Analyzer>();
        analyzerMap.put(IndexFields.NAME, filenameAnalyzer);
        analyzerMap.put(IndexFields.PATH, filenameAnalyzer);

        return new PerFieldAnalyzerWrapper(new KeywordAnalyzer(), analyzerMap);
    }

    public static void optimize(String dir) {
        IndexWriter iw;
        try {
            iw = createIndexWriter(dir);
            iw.forceMerge(5);
            iw.close();
        } catch (IOException e) {
            log.error("Exception during optimizing index directory '" + dir + "': " + e.getMessage());
        }
    }
}