org.scify.NewSumServer.Server.Searching.Indexer.java Source code

Introduction

Here is the source code for org.scify.NewSumServer.Server.Searching.Indexer.java, the NewSum server class that traverses a directory of text files and builds a Lucene 3.6 index over them.

Source

/*
 * Copyright 2013 SciFY NPO <info@scify.org>.
 *
 * This product is part of the NewSum Free Software.
 * For more information about NewSum visit
 * 
 *    http://www.scify.gr/site/en/our-projects/completed-projects/newsum-menu-en
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 * 
 * If this code or its output is used, extended, re-engineered, integrated, 
 * or embedded to any extent in another software or hardware, there MUST be 
 * an explicit attribution to this work in the resulting source code, 
 * the packaging (where such packaging exists), or user interface 
 * (where such an interface exists). 
 * The attribution must be of the form "Powered by NewSum, SciFY"
 */

package org.scify.NewSumServer.Server.Searching;

import java.io.File;
import java.io.FileFilter;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Locale;
import java.util.logging.Level;
import java.util.logging.Logger;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.LimitTokenCountAnalyzer;
import org.apache.lucene.analysis.el.GreekAnalyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.store.LockObtainFailedException;
import org.apache.lucene.util.Version;
import org.scify.NewSumServer.Server.Utils.Main;
import org.scify.NewSumServer.Server.Utils.Utilities;

/**
 * The class used for indexing: traverses a directory of text files
 * and builds a Lucene index over them.
 * @author George K. <gkiom@scify.org>
 */
public class Indexer {

    private static final String FILE_FIELD = "file";
    private static final String TEXT_FIELD = "text";
    /**
     * The Directory containing the Index Files
     */
    private File indexDir;
    /**
     * The Absolute path to the Directory where the Files to be indexed are
     */
    private String sFilesPath;
    /**
     * The Absolute path to the Directory where the Indexed Files are stored
     */
    private String sIndexPath;

    private Locale lLoc;

    private Analyzer anal;
    /**
     * The Global Logger Class.
     */
    protected final static Logger LOGGER = Main.getLogger();

    //    public final String sFileSeparator = System.getProperty("file.separator");

    /**
     *
     * @param sFilesPath The Absolute path to the Directory where the Files to be indexed are
     * @param sIndexPath The Absolute path to the Directory where the Indexed Files are stored
     * @param loc The locale that the files will be indexed with
     */
    public Indexer(String sFilesPath, String sIndexPath, Locale loc) {
        this.sFilesPath = sFilesPath;
        this.sIndexPath = sIndexPath;
        this.lLoc = loc;
        // The dir the Index files will be saved in
        indexDir = new File(this.sIndexPath);
    }

    /**
     * The Main method of the Indexer Class.
     * Traverses a directory and creates the index files needed for the package to
     * operate.
     * @throws CorruptIndexException
     * @throws LockObtainFailedException
     * @throws IOException
     */
    public void createIndex() throws CorruptIndexException, LockObtainFailedException, IOException {
        // The dir containing the Files to Index
        File docDir = new File(this.sFilesPath);
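        // Open a platform-appropriate on-disk Directory for the index files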
        Directory FSDir = FSDirectory.open(indexDir);
        // init the Analyzer, according to the language of the given locale
        if (lLoc.getLanguage().equals("el")) {
            anal = new GreekAnalyzer(Version.LUCENE_36);
        } else {
            // The standard analyzer, used for "en" and as a fallback for
            // any other locale, so that anal is never left null below
            Analyzer stdAnal = new StandardAnalyzer(Version.LUCENE_36);
            // Wrap it in order to index all the text in a field,
            // however long that field may be
            anal = new LimitTokenCountAnalyzer(stdAnal, Integer.MAX_VALUE);
        }
        // The configuration for the Index Writer
        IndexWriterConfig conf = new IndexWriterConfig(Version.LUCENE_36, anal);
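        // CREATE mode: any existing index in this directory is discarded
        // and rebuilt from scratch on open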
        conf.setOpenMode(IndexWriterConfig.OpenMode.CREATE);
        // The Index Writer
        IndexWriter indexWriter = new IndexWriter(FSDir, conf);
        // For each File in the dir, create a Document
        for (File file : getFilesFromFirstLevelSubdirs(docDir)) {
            String filename = file.getName();
            String fullFileName = file.getAbsolutePath();
            String tmpText = Utilities.readFromFile(fullFileName, " ");
            Document d = new Document(); //lucene Document
            // Add the "filename" field
            d.add(new Field(FILE_FIELD, filename, Field.Store.YES, Field.Index.NOT_ANALYZED));
            // Add The "Text" Field
            d.add(new Field(TEXT_FIELD, tmpText, Field.Store.YES, Field.Index.ANALYZED));

            // Add the Document to the Writer
            indexWriter.addDocument(d);
        }
        int numDocs = indexWriter.numDocs();
        // the index will be merged down into a single segment, resulting in
        // a smaller index with better search performance. Costly Operation,
        // DO NOT USE on large dirs or when low disk space (needs (2-3)*DirSize)
        indexWriter.forceMerge(1);
        // Syncs All referenced Index Files.
        // At this point old indexes will be deleted, freeing up space
        indexWriter.commit();
        // Terminate the Writer appropriately
        indexWriter.close();
        LOGGER.log(Level.INFO, "Successfully closed indexWriter; indexed {0} docs with {1}",
                new Object[]{numDocs, anal});
    }

    /**
     *
     * @return The Directory that the Index Files are in
     */
    public File getIndexDirectory() {
        //        Logger.getAnonymousLogger().log(Level.INFO, "INDEXER-->INDEXPATH: {0}", indexDir);
        return indexDir;
    }
    //    public Analyzer getAnalyzer() {
    //        return anal;
    //    }

    /**
     * Gets the (non-directory) files under a given directory, descending
     * recursively into its subdirectories.
     * If the given file is not a directory, returns an empty list.
     * @param fDir The directory to traverse.
     * @return The list of (non-directory) files found under fDir.
     */
    protected List<File> getFilesFromFirstLevelSubdirs(File fDir) {
        ArrayList<File> lRes = new ArrayList<File>();
        // Only analyze, if a directory
        if (fDir.isDirectory()) {
            // Check only first level
            lRes.addAll(Arrays.asList(fDir.listFiles(new FileFilter() {

                @Override
                public boolean accept(File file) {
                    // Do NOT accept directories
                    return !file.isDirectory();
                }
            })));

            // For every subdir
            for (File fSubDir : fDir.listFiles(new FileFilter() {

                @Override
                public boolean accept(File file) {
                    // Do NOT accept directories
                    return file.isDirectory();
                }
            })) {
                // Get children files (Recursion)
                lRes.addAll(getFilesFromFirstLevelSubdirs(fSubDir));
            }
        }

        // Return result
        return lRes;
    }
}
//final class LuceneUtil {
//
//  private LuceneUtil() {}
//
//  public static List<String> tokenizeString(Analyzer analyzer, String string) {
//    List<String> result = new ArrayList<String>();
//    try {//or 'field' instead of null
//      TokenStream stream  = analyzer.tokenStream(null, new StringReader(string));
//      while (stream.incrementToken()) {
//        result.add(stream.getAttribute(CharTermAttribute.class).toString());
//      }
//    } catch (IOException e) {
//      // not thrown b/c we're using a string reader...
//      throw new RuntimeException(e);
//    }
//    return result;
//  }
//
//}
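
Usage

A minimal driver sketch follows. It is not part of the NewSum sources: the IndexerDemo class name, the file paths, and the "economy" query string are hypothetical placeholders, and the search half assumes the same Lucene 3.6 API that the imports above pin down (IndexReader.open, org.apache.lucene.queryParser.QueryParser).

import java.util.Locale;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;
import org.scify.NewSumServer.Server.Searching.Indexer;

public class IndexerDemo {

    public static void main(String[] args) throws Exception {
        // Hypothetical paths; point them at a real corpus and index location
        Indexer indexer = new Indexer("/data/newsum/articles",
                                      "/data/newsum/index",
                                      new Locale("en"));
        indexer.createIndex();

        // Query the freshly built index on the analyzed "text" field, using
        // the same tokenization the "en" branch indexed with (the
        // LimitTokenCountAnalyzer wrapper only lifts the token-count cap)
        IndexReader reader = IndexReader.open(
                FSDirectory.open(indexer.getIndexDirectory()));
        IndexSearcher searcher = new IndexSearcher(reader);
        Query query = new QueryParser(Version.LUCENE_36, "text",
                new StandardAnalyzer(Version.LUCENE_36)).parse("economy");
        for (ScoreDoc hit : searcher.search(query, 10).scoreDocs) {
            // "file" was stored with Store.YES, so it can be read back
            System.out.println(searcher.doc(hit.doc).get("file"));
        }
        searcher.close();
        reader.close();
    }
}

Because createIndex() opens its writer in CREATE mode, every run rebuilds the index from scratch, so re-running the driver after the corpus changes is all that is needed to refresh it.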