kbp2013.index.IndexSourceCorpus.java Source code

Introduction

Here is the source code for kbp2013.index.IndexSourceCorpus.java
Source

package kbp2013.index;

/*
    
SemLinker V 0.9
Copyright (C) 2013  Eric Charton & Marie-Jean Meurs &
                Ludovic Jean-Louis & Michel Gagnon
    
This program is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License
as published by the Free Software Foundation; either version 2
of the License, or (at your option) any later version.
    
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.
    
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, 
Boston, MA  02110-1301, USA.
    
Contacts :
    
This software is maintained and released at:
    
https://code.google.com/p/semlinker/
    
Please contact respective authors from this page for support
or any inquiries. 
    
*/

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.StringField;
import org.apache.lucene.index.CheckIndex;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;

import configure.NistKBPConfiguration;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Builds a Lucene index from the KBP source corpus.
 *
 * <p>The entry point reads a list of file names (one per line), prefixes each
 * name with the corpus path, splits every listed file into
 * {@code <DOC> ... </DOC>} documents and adds one Lucene document per corpus
 * document with two fields: {@code id} (the normalized document identifier)
 * and {@code text} (the raw text of the document, markup included, lines
 * joined with {@code \n}).
 *
 * @deprecated This class is replaced by V2.
 * @author ericcharton
 * @see kbp2013.index.IndexSourceCorpus_v2
 */
@Deprecated
public class IndexSourceCorpus {

    //--------------------------------------
    // Indexation mode
    //--------------------------------------
    /** 1 -> create a new index and refuse to run if the index directory exists / 0 -> append to an existing index. */
    private static int create = 1;
    /** 1 -> run Lucene's CheckIndex before appending (only consulted when {@code create == 0}). */
    private static int checkindex = 0;

    /** When true, prints one line per listed file and one line per indexed document. */
    private static final boolean VERBOSE = true;

    /** The writer is committed each time this many further documents have been added. */
    private static final int DOCS_PER_COMMIT = 11;

    // Tag matching uses CASE_INSENSITIVE patterns instead of String.toUpperCase() so the
    // result does not depend on the JVM default locale.

    /** First line of a document: {@code <DOC ID=...} or {@code <DOC>}. */
    private static final Pattern DOC_START = Pattern.compile("<DOC ID=|<DOC>", Pattern.CASE_INSENSITIVE);
    /** Last line of a document. */
    private static final Pattern DOC_END = Pattern.compile("</DOC>", Pattern.CASE_INSENSITIVE);
    /** A line carrying the document identifier, in either of the two supported forms. */
    private static final Pattern ID_MARKER = Pattern.compile("<DOC ID=|<DOCID>", Pattern.CASE_INSENSITIVE);
    /** Form 1: {@code <DOCID> ALHURRA_NEWS13_ARB_20050412_130100-2.LDC2006E92 </DOCID>}. */
    private static final Pattern DOCID_ELEMENT =
            Pattern.compile("<DOCID>(.*?)(?:</DOCID>|$)", Pattern.CASE_INSENSITIVE);
    /** Form 2: {@code <doc id="bolt-eng-DF-183-195681-7948494">}, with or without further attributes. */
    private static final Pattern DOC_ID_ATTRIBUTE =
            Pattern.compile("<DOC ID=\"([^\"]*)", Pattern.CASE_INSENSITIVE);

    /** Not instantiable: the class only exposes static entry points. */
    private IndexSourceCorpus() {
    }

    private static String home;
    private static String index = "index";
    // NOTE(review): 'home' is still null when this initializer runs, so the value is
    // new File("nullindex"). main() overwrites it with the configured index path.
    // Kept as is because the field is visible to the rest of the package.
    static File INDEX_DIR = new File(home + index);

    private static String luceneIndex;
    private static String homelist;

    /**
     * Loads the corpus path, the path of the file list and the index path from a
     * fresh {@link NistKBPConfiguration}.
     */
    public static void initializeFromDefault() {

        NistKBPConfiguration lucenvars = new NistKBPConfiguration();

        home = lucenvars.KBP_CORPUS_PATH;
        homelist = lucenvars.KBP_CORPUS_FILES;
        luceneIndex = lucenvars.INDEX;

    }

    /**
     * Indexes every document of every file named in the configured file list.
     *
     * <p>The process exits with status 1 when the index directory already exists in
     * create mode, or when the corpus directory is missing or unreadable. Both checks
     * run before the index is opened, so an early exit leaves no open writer behind.
     *
     * @param args unused
     * @throws IOException if the file list, a corpus file or the index cannot be read or written
     */
    public static void main(String[] args) throws IOException {

        initializeFromDefault();

        System.out.println("Indexing to directory '" + luceneIndex + "'...");

        INDEX_DIR = new File(luceneIndex);
        if (INDEX_DIR.exists() && create == 1) {
            System.out.println("Cannot save index to '" + INDEX_DIR + "' directory, please delete it first");
            System.exit(1);
        }

        final File docDir = new File(home);
        System.out.println("Indexing directory '" + home + "'...");
        if (!docDir.exists() || !docDir.canRead()) {
            System.out.println("Document directory '" + docDir.getAbsolutePath()
                    + "' does not exist or is not readable, please check the path");
            System.exit(1);
        }

        Directory dir = FSDirectory.open(INDEX_DIR);
        try {
            // Open lucene stuff
            Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_43);
            IndexWriterConfig iwc = new IndexWriterConfig(Version.LUCENE_43, analyzer);
            iwc.setMaxThreadStates(100);

            // manage append mode
            if (create == 0) {
                // add new documents to an existing index
                iwc.setOpenMode(OpenMode.CREATE_OR_APPEND);
                // if appending, optionally check the existing index first
                if (checkindex == 1) {
                    System.out.println("Checking index ...");
                    CheckIndex ci = new CheckIndex(dir);
                    ci.checkIndex();
                    System.out.println("End of Checking index");
                }
            } else {
                iwc.setOpenMode(OpenMode.CREATE);
            }

            IndexWriter writer = new IndexWriter(dir, iwc);
            try {
                indexListedFiles(writer);
            } finally {
                // !! Caution !! if the writer is not closed the index is left unusable
                // and has to be regenerated, so close it even when indexing failed.
                writer.close();
            }
        } finally {
            dir.close();
        }

    }

    /** Reads the file list line by line and indexes every listed regular file that is not a .gz archive. */
    private static void indexListedFiles(IndexWriter writer) throws IOException {
        int counted = 0; // documents indexed so far, over all files

        BufferedReader reader = new BufferedReader(new FileReader(homelist));
        try {
            String listed;
            while ((listed = reader.readLine()) != null) {

                String filename = home + listed;

                if (VERBOSE) {
                    System.out.println("---V-->" + "Indexing content of " + filename);
                }

                // String.contains takes a literal, not a regex: match ".gz" directly.
                if (new File(filename).isFile() && !filename.contains(".gz")) {
                    counted = indexCorpusFile(writer, filename, counted);
                } else {
                    System.out.println(counted + ":Non lisible ou non requis:" + filename);
                }
            }
        } finally {
            reader.close();
        }
    }

    /**
     * Indexes every document found in one corpus file and returns the updated
     * running total of indexed documents.
     */
    private static int indexCorpusFile(IndexWriter writer, String filename, int counted) throws IOException {
        BufferedReader readerDoc = new BufferedReader(new FileReader(filename));
        try {
            String line;
            while ((line = readerDoc.readLine()) != null) {
                if (!DOC_START.matcher(line).find()) {
                    continue;
                }

                // Keep the opening line and every following line verbatim so that
                // character offsets of mentions stay valid against the stored text.
                StringBuilder textdoc = new StringBuilder(line);
                String fileRef = ""; // line holding the id of THIS document (reset per document)
                String current = line;
                while (true) {
                    if (ID_MARKER.matcher(current).find()) {
                        fileRef = current;
                    }
                    if (DOC_END.matcher(current).find()) {
                        break;
                    }
                    current = readerDoc.readLine();
                    if (current == null) {
                        // End of file reached inside a document: index what was read.
                        System.out.println("Unterminated document (missing </DOC>) at end of " + filename);
                        break;
                    }
                    textdoc.append('\n').append(current);
                }

                String idStr = extractDocId(fileRef);
                indexDocs(writer, idStr, textdoc.toString());
                counted++;

                if (VERBOSE) {
                    System.out.println(
                            "---V-->" + counted + ":" + filename + ":" + idStr + ":" + textdoc.length());
                }

                if (counted % DOCS_PER_COMMIT == 0) {
                    System.out.println(counted + ":" + filename + ":------>" + idStr);
                    writer.commit();
                }
            }
        } finally {
            readerDoc.close();
        }
        return counted;
    }

    /**
     * Extracts the document identifier from the line that carries it.
     * Returns the line unchanged when it matches neither supported form.
     */
    private static String extractDocId(String idLine) {
        Matcher element = DOCID_ELEMENT.matcher(idLine);
        if (element.find()) {
            // the identifier is padded with spaces inside the element: drop them
            return element.group(1).replace(" ", "");
        }
        Matcher attribute = DOC_ID_ATTRIBUTE.matcher(idLine);
        if (attribute.find()) {
            return attribute.group(1);
        }
        return idLine;
    }

    /**
     * Adds one document to the index.
     *
     * <p>The identifier is normalized ({@code -} replaced by {@code _}, then lower-cased)
     * before being stored in the {@code id} field; the text goes to the {@code text}
     * field unchanged. Failures are reported on standard output and do not stop the run.
     *
     * @param writer the open index writer
     * @param srcID  the document identifier as extracted from the corpus
     * @param text   the complete text of the document
     */
    static void indexDocs(IndexWriter writer, String srcID, String text) {

        // NOTE(review): toLowerCase() uses the default locale; left unchanged because
        // code that looks ids up presumably applies the same normalization -- confirm.
        String normalizedId = srcID.replace("-", "_").toLowerCase();

        // caution : do not make mistake with the reference obj (use Lucene not Java)
        Document doc = new Document();
        doc.add(new StringField("id", normalizedId, Field.Store.YES));
        // NOTE(review): StringField indexes the whole text as one un-tokenized term;
        // confirm this is intended rather than a stored-only or tokenized field.
        doc.add(new StringField("text", text, Field.Store.YES));

        try {

            writer.addDocument(doc);

        } catch (IOException e) {
            System.out.println("This is IOException in Writer");
            e.printStackTrace();
        } catch (Exception e) {
            // deliberate best effort: one bad document must not abort the whole run
            System.out.println("This is an Exception in Writer");
            e.printStackTrace();
        }

    }

}