kbp2013.index.IndexWikipediaCorpus.java Source code

Introduction

Here is the source code for kbp2013.index.IndexWikipediaCorpus.java
Source

package kbp2013.index;

/*
    
SemLinker V 0.9
Copyright (C) 2013  Eric Charton & Marie-Jean Meurs &
                Ludovic Jean-Louis & Michel Gagnon
    
This program is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License
as published by the Free Software Foundation; either version 2
of the License, or (at your option) any later version.
    
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.
    
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, 
Boston, MA  02110-1301, USA.
    
Contacts :
    
This software is maintained and released at:
    
https://code.google.com/p/semlinker/
    
Please contact respective authors from this page for support
or any inquiries. 
    
*/

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.StringField;
import org.apache.lucene.index.CheckIndex;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;

import configure.NistKBPConfiguration;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Pattern;

/**
 * Builds a Lucene index from a Wikipedia XML dump.
 *
 * <p>The dump is scanned line by line. The {@code <siteinfo>} header is read
 * first to collect the namespace prefixes (every {@code <namespace>} entry
 * except key 0); then every {@code <page>...</page>} span is accumulated and
 * indexed as one document, unless its title starts with one of those
 * prefixes or its content carries a {@code #REDIRECT} marker.
 *
 * <p>Paths (dump file, index directory) come from
 * {@link NistKBPConfiguration}; see {@link #initializeFromDefault()}.
 *
 * @deprecated This class is replaced by V2.
 * @author ericcharton
 * @see kbp2013.index.IndexWikipediaCorpus_v2
 */
@Deprecated
public class IndexWikipediaCorpus {

    //--------------------------------------
    // Indexation mode
    //--------------------------------------

    /** 1 -> create a fresh index (main() refuses to run if the directory exists) / 0 -> append to an existing index. */
    private static int create = 1;

    /** 1 -> run Lucene's CheckIndex before appending; only consulted when {@code create == 0}. */
    private static int checkindex = 0;

    /** Number of pages processed between two progress reports (each report also commits the writer). */
    private static final int REPORT_INTERVAL = 1000;

    /** Redirect marker; matched case-insensitively because MediaWiki accepts {@code #redirect} as well. */
    private static final Pattern REDIRECT_MARKER = Pattern.compile("#REDIRECT", Pattern.CASE_INSENSITIVE);

    private IndexWikipediaCorpus() {
    }

    private static String home;
    private static String index = "index";

    // NOTE(review): home is still null when this initializer runs, so the
    // value is only a placeholder (path "nullindex") until main() reassigns
    // INDEX_DIR from the configured index location.
    static File INDEX_DIR = new File(home + index);

    private static String wikiluceneIndex;
    private static String wikidump;

    /**
     * Loads the corpus path, the Wikipedia dump file and the Wikipedia index
     * location from a fresh {@link NistKBPConfiguration}.
     */
    public static void initializeFromDefault() {

        NistKBPConfiguration lucenvars = new NistKBPConfiguration();

        home = lucenvars.KBP_CORPUS_PATH;
        wikidump = lucenvars.WIKI_CORPUS_FILE;
        wikiluceneIndex = lucenvars.INDEX_WIKIPEDIA;

    }

    /**
     * Entry point: indexes the configured Wikipedia dump into the configured
     * Lucene directory.
     *
     * <p>In create mode the process exits with status 1 when the index
     * directory already exists. The index writer is always closed, even when
     * indexing fails half-way, so the pages written so far stay readable.
     *
     * @param args unused
     * @throws IOException if the dump cannot be read or the index cannot be written
     */
    public static void main(String[] args) throws IOException {

        initializeFromDefault();

        System.out.println("Indexing Wikipedia Dump to directory '" + wikiluceneIndex + "'...");

        INDEX_DIR = new File(wikiluceneIndex);
        if (INDEX_DIR.exists() && create == 1) {
            System.out.println("Cannot save index to '" + INDEX_DIR + "' directory, please delete it first");
            System.exit(1);
        }

        Directory dir = FSDirectory.open(INDEX_DIR);
        try {
            IndexWriter writer = new IndexWriter(dir, buildWriterConfig(dir));
            try {
                indexDump(writer);
            } finally {
                // !! Caution !! if the writer is not closed the index is corrupted
                // and has to be regenerated, hence the finally block.
                writer.close();
            }
        } finally {
            dir.close();
        }

    }

    /**
     * Builds the writer configuration for the current indexation mode and,
     * when appending with {@code checkindex == 1}, runs CheckIndex first.
     */
    private static IndexWriterConfig buildWriterConfig(Directory dir) throws IOException {

        Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_44);
        IndexWriterConfig iwc = new IndexWriterConfig(Version.LUCENE_44, analyzer);
        iwc.setMaxThreadStates(100);

        if (create == 0) {
            // add new documents to an existing index
            iwc.setOpenMode(OpenMode.CREATE_OR_APPEND);
            if (checkindex == 1) {
                System.out.println("Checking index ...");
                CheckIndex ci = new CheckIndex(dir);
                // NOTE(review): the returned status is ignored, so a broken
                // index is reported by CheckIndex but does not stop the run.
                ci.checkIndex();
                System.out.println("End of Checking index");
            }
        } else {
            iwc.setOpenMode(OpenMode.CREATE);
        }

        return iwc;
    }

    /** Opens the dump as UTF-8, reads the namespace header, then indexes every page; always closes the reader. */
    private static void indexDump(IndexWriter writer) throws IOException {

        // Decode explicitly as UTF-8 instead of relying on the platform default charset.
        BufferedReader reader = new BufferedReader(
                new InputStreamReader(new FileInputStream(wikidump), "UTF-8"));
        try {
            List<String> namespacePrefixes = readNamespacePrefixes(reader);
            System.out.println("--------------------------------");
            indexPages(reader, writer, namespacePrefixes);
        } finally {
            reader.close();
        }
    }

    /**
     * Consumes the dump up to and including the line holding {@code </siteinfo>}
     * and returns the namespace prefixes ("Name:") declared there, key 0 excluded.
     * Stops quietly (with a warning) if the dump ends before {@code </siteinfo>}.
     */
    private static List<String> readNamespacePrefixes(BufferedReader reader) throws IOException {

        List<String> prefixes = new ArrayList<String>();

        String line;
        while ((line = reader.readLine()) != null) {

            if (line.contains("<namespace key=") && !line.contains("<namespace key=\"0")) {
                String name = line.replaceAll("<namespace key=[^>]+>", "")
                        .replace("</namespace>", "")
                        .trim();
                // Test for emptiness BEFORE appending ':' — a bare ":" prefix
                // would exclude every title that contains a colon.
                if (!name.isEmpty()) {
                    String prefix = name + ":";
                    prefixes.add(prefix);
                    System.out.println("Registered domain:" + prefix + ";");
                }
            }

            if (line.contains("</siteinfo>")) {
                return prefixes;
            }
        }

        System.out.println("Warning: end of dump reached before </siteinfo>");
        return prefixes;
    }

    /**
     * Reads the remaining dump page by page and indexes every page that is
     * neither in a registered namespace nor a redirect. Prints a progress
     * line and commits the writer every {@link #REPORT_INTERVAL} pages.
     */
    private static void indexPages(BufferedReader reader, IndexWriter writer, List<String> namespacePrefixes)
            throws IOException {

        int sinceLastReport = 0; // pages seen since the last progress line
        int totalPages = 0;      // pages seen overall
        int savedPages = 0;      // pages handed to indexDocs

        String line;
        while ((line = reader.readLine()) != null) {

            if (!line.contains("<page>")) {
                continue;
            }

            // Accumulate the page. Lines are joined without any separator
            // (readLine strips the line terminators), as the stored text
            // has always been built that way.
            StringBuilder page = new StringBuilder(line);
            String pagename = "";

            while (!line.contains("</page>")) {
                line = reader.readLine();
                if (line == null) {
                    break; // truncated dump
                }
                page.append(line);

                if (line.contains("<title>")) {
                    pagename = line.replace("<title>", "")
                            .replace("</title>", "")
                            .replaceAll("[ ]{2,10}", ""); // drops the indentation
                }
            }

            if (line == null) {
                System.out.println("Warning: dump ends inside page '" + pagename + "', page skipped");
                break;
            }

            String textdoc = page.toString();

            // Keep the page only if it is
            //         A) not a redirect
            //         B) not from a namespace ("domain")
            boolean tosave = true;
            if (isInNamespace(pagename, namespacePrefixes)) {
                System.out.println("Specific page:" + pagename);
                tosave = false;
            } else if (REDIRECT_MARKER.matcher(textdoc).find()) {
                tosave = false;
            }

            if (tosave) {
                savedPages++;
                indexDocs(writer, pagename, textdoc);
            }

            // display info
            totalPages++;
            sinceLastReport++;

            if (sinceLastReport >= REPORT_INTERVAL) {
                sinceLastReport = 0;
                System.out.println(totalPages + ":" + savedPages + ":" + pagename + ":------>" + textdoc.length());
                writer.commit();
            }
        }
    }

    /**
     * Tells whether the title starts (case-insensitively) with one of the
     * namespace prefixes. A prefix occurring later in the title, e.g.
     * "The Jungle Book: ..." against "Book:", does not count.
     */
    private static boolean isInNamespace(String pagename, List<String> namespacePrefixes) {

        String title = pagename.trim();
        for (String prefix : namespacePrefixes) {
            if (title.regionMatches(true, 0, prefix, 0, prefix.length())) {
                return true;
            }
        }
        return false;
    }

    /**
     * Adds one page to the index as a document with two stored fields,
     * "title" and "text".
     *
     * <p>The title is normalized before storage: every '-' becomes '_' and
     * the result is lower-cased. Failures are reported on standard output and
     * swallowed, so one bad page does not abort the whole run.
     *
     * @param writer the open index writer
     * @param srcID  the page title, used as document identifier
     * @param text   the raw page content
     */
    static void indexDocs(IndexWriter writer, String srcID, String text) {

        Document doc = new Document();

        srcID = srcID.replace("-", "_"); // normalize
        srcID = srcID.toLowerCase();

        // Use StringField for id fields and Textfield for tokenized file
        // caution : do not make mistake with the reference obj (use Lucene not Java)
        doc.add(new StringField("title", srcID, Field.Store.YES));
        // NOTE(review): "text" is also a StringField, i.e. the whole page is
        // indexed as one untokenized term, which contradicts the comment
        // above. Left as is because switching to TextField changes what the
        // index answers to — confirm with the index consumers first.
        doc.add(new StringField("text", text, Field.Store.YES));

        try {

            writer.addDocument(doc);

        } catch (IOException e) {
            System.out.println("This is IOException in Writer");
            e.printStackTrace();
        } catch (Exception e) {
            // deliberate best effort: report and keep indexing the next pages
            System.out.println("This is an Exception in Writer");
            e.printStackTrace();
        }

    }

}