invertedindex.IndexCreater.java Source code

Java tutorial

Introduction

Here is the source code for invertedindex.IndexCreater.java

Source

/*
 * To change this license header, choose License Headers in Project Properties.
 * To change this template file, choose Tools | Templates
 * and open the template in the editor.
 */
package invertedindex;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.ArrayList;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.core.SimpleAnalyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;
import org.apache.tika.metadata.Metadata;
import org.json.simple.JSONObject;

/**
 * Builds a Lucene index from documents described by {@link JSONObject} records.
 *
 * <p>The constructor opens an {@link IndexWriter} in
 * {@link IndexWriterConfig.OpenMode#CREATE} mode on the location returned by
 * {@code ConfigFile.getIndexPath()}. Documents are then added one at a time with
 * {@link #indexFileOrDirectory(JSONObject)}, and {@link #closeIndex()} must be called
 * when indexing is finished.
 *
 * @author kkgarg
 */
public class IndexCreater {

    /** Analyzer handed to the {@link IndexWriterConfig}; used for analyzed fields. */
    private static final Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_47);

    /** Directory backing the index; kept so it can be closed together with the writer. */
    private final FSDirectory directory;

    /** Writer through which every document is added. */
    private final IndexWriter writer;

    /**
     * Number of documents added successfully so far. The current value is also used as the
     * numeric suffix of the next generated {@code id} field ({@code doc_0}, {@code doc_1}, ...).
     * The counter is static, so it is shared by all instances, and it is updated without
     * any synchronization.
     */
    public static int count = 0;

    /**
     * Reads the index location from the project's {@code ConfigFile}.
     *
     * @return the configured index path; never {@code null} or blank
     * @throws IOException if the configuration cannot be read or yields no usable path.
     *         Falling back to an empty path would make Lucene create the index in the
     *         current working directory, so the failure is reported instead.
     */
    private String getIndexPath() throws IOException {
        String indexLocation;
        try {
            ConfigFile cf = new ConfigFile();
            indexLocation = cf.getIndexPath();
        } catch (Exception e) {
            throw new IOException("Could not read the index path from the configuration", e);
        }
        if (indexLocation == null || indexLocation.trim().isEmpty()) {
            throw new IOException("No index path is configured");
        }
        return indexLocation;
    }

    /**
     * Opens the index writer on the configured location.
     *
     * @throws IOException if the index path cannot be determined or the index cannot be opened
     */
    public IndexCreater() throws IOException {
        FSDirectory dir = FSDirectory.open(new File(getIndexPath()));
        IndexWriter opened = null;
        try {
            IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_47, analyzer);
            // CREATE starts a fresh index, replacing any index already at this location.
            config.setOpenMode(IndexWriterConfig.OpenMode.CREATE);
            opened = new IndexWriter(dir, config);
        } finally {
            // Do not leave the directory open when the writer could not be created.
            if (opened == null) {
                dir.close();
            }
        }
        directory = dir;
        writer = opened;
    }

    /**
     * Adds one document to the index.
     *
     * <p>The following keys are read from {@code fileDetails}, each expected to hold a
     * {@code String}:
     * <ul>
     *   <li>{@code "contents"} - stored and analyzed, with term vectors including positions
     *       and offsets;</li>
     *   <li>{@code "path"}, {@code "filename"}, {@code "Content-Type"} - stored as
     *       {@link StringField}s named {@code path}, {@code filename} and
     *       {@code contentType}.</li>
     * </ul>
     * An {@code id} field of the form {@code doc_<count>} is generated as well.
     *
     * <p>Indexing is best-effort: if anything goes wrong (missing or non-string value,
     * write failure, ...) a message naming the file and the cause is printed, the document
     * is skipped, and {@link #count} is left unchanged.
     *
     * @param fileDetails the record describing the document to index
     * @throws IOException declared for callers; the current implementation reports
     *         failures on standard output instead of propagating them
     */
    public void indexFileOrDirectory(JSONObject fileDetails) throws IOException {
        // Resolve a printable name once, up front, so that reporting a failure can never
        // itself fail on a null record or a non-string "filename" value.
        final Object rawName = (fileDetails == null) ? null : fileDetails.get("filename");
        final String label = String.valueOf(rawName);

        try {
            Document doc = new Document();

            // NOTE(review): the Field(String, String, Store, Index[, TermVector]) constructors
            // are the legacy form (deprecated in Lucene 4.x); they are kept so the index
            // options of "id" and "contents" stay exactly as before.
            doc.add(new Field("id", "doc_" + count, Field.Store.YES, Field.Index.NOT_ANALYZED_NO_NORMS));
            doc.add(new Field("contents", (String) fileDetails.get("contents"), Field.Store.YES,
                    Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS));
            doc.add(new StringField("path", (String) fileDetails.get("path"), Field.Store.YES));
            doc.add(new StringField("filename", (String) fileDetails.get("filename"), Field.Store.YES));
            doc.add(new StringField("contentType", (String) fileDetails.get("Content-Type"), Field.Store.YES));

            writer.addDocument(doc);
            System.out.println("Added: " + label);
            // Only successfully added documents consume an id.
            count++;
        } catch (Exception e) {
            // Keep going with the remaining documents, but say why this one was skipped.
            System.out.println("Could not add :" + label + " (" + e + ")");
        }
    }

    /**
     * Closes the index writer and then the underlying directory.
     *
     * <p>The directory is closed only after the writer closed successfully, so a failed
     * close can be retried.
     *
     * @throws IOException if closing the writer or the directory fails
     */
    public void closeIndex() throws IOException {
        writer.close();
        directory.close();
    }

}