kbp2013.index.IndexSourceCorpus_v2.java Source code

Java tutorial

Introduction

Here is the source code for kbp2013.index.IndexSourceCorpus_v2.java

Source

package kbp2013.index;

/*
    
SemLinker V 0.9
Copyright (C) 2013  Eric Charton & Marie-Jean Meurs &
                Ludovic Jean-Louis & Michel Gagnon
    
This program is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License
as published by the Free Software Foundation; either version 2
of the License, or (at your option) any later version.
    
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.
    
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, 
Boston, MA  02110-1301, USA.
    
Contacts :
    
This software is maintained and released at:
    
https://code.google.com/p/semlinker/
    
Please contact respective authors from this page for support
or any inquiries. 
    
*/

import java.io.BufferedInputStream;
import java.io.BufferedReader;
import java.io.FileNotFoundException;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;

import java.io.File;
import java.io.FileInputStream;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.util.Date;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.zip.GZIPInputStream;
import org.apache.commons.lang.StringUtils;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
import org.apache.lucene.store.Directory;

/**
 * Indexes a KBP source corpus with Lucene, driven by a list file.
 * <p>
 * The list file names one corpus file per line. A corpus file whose name ends
 * with ".gz" is read through a gzip stream, any other file is read as plain
 * text. Each corpus file is read line by line and cut into documents at every
 * line that contains a closing {@code </doc>} tag (matched case-insensitively).
 * Every document is written to the index with four stored fields:
 * <ul>
 *   <li>{@code id}    - the document id, with '-' replaced by '_' and lower-cased</li>
 *   <li>{@code text}  - the raw document text with its line breaks preserved</li>
 *   <li>{@code title} - the text found between the headline tags, if any</li>
 *   <li>{@code cnt}   - the document text flattened onto a single line</li>
 * </ul>
 * <p>
 * Usage: {@code IndexSourceCorpus_v2 <indexDirectory> <inputListFile>}
 *
 * @author ludovicjeanlouis
 */
public class IndexSourceCorpus_v2 {

    /** Placeholder standing for a line break while the raw text is buffered and trimmed. */
    private static final String LINE_BREAK_MARKER = "MY_CUSTOM_SPACE";
    /** Prefix of the tag that opens a document; matched case-insensitively. */
    private static final String DOC_OPEN_TAG = "<doc";
    /** Tag that closes a document; matched case-insensitively. */
    private static final String DOC_CLOSE_TAG = "</doc>";

    /* Run configuration, filled in from the command line by main(). */
    private static File indexDir;
    private static String inputLstFile;

    // The three instance patterns below are not referenced inside this class;
    // they are kept because they are visible to the rest of the package.
    Pattern patternTitle = Pattern.compile("<headline>(.*)</headline>");
    // The three known layouts of a document id. All of them are greedy, which
    // is why extractDocId() trims what they capture.
    static Pattern docId_1 = Pattern.compile("<doc id=\"(.*)\"><");
    static Pattern docId_2 = Pattern.compile("<DOC id=\"(.*)\" type");
    static Pattern docId_3 = Pattern.compile("<DOCID>(.*)</DOCID>");
    Pattern pageStart = Pattern.compile("<page>");
    Pattern pageEnd = Pattern.compile("</page>");

    /**
     * Builds (or extends) the Lucene index from the files named in the list file.
     * <p>
     * If the run fails, the writer is rolled back so that nothing added during
     * the run is committed; if it succeeds, the writer is closed normally.
     *
     * @param  args {@code args[0]} is the index directory, {@code args[1]} is
     *              the path of the list file (one corpus file path per line)
     * @throws IllegalArgumentException if fewer than two arguments are given
     * @throws IOException              if a corpus file or the index cannot be read or written
     * @throws FileNotFoundException    if the list file or a listed corpus file does not exist
     * @throws ClassNotFoundException   declared for compatibility with existing callers
     * @throws Exception                declared for compatibility with existing callers
     */
    public static void main(String[] args)
            throws IOException, FileNotFoundException, ClassNotFoundException, Exception {

        // Nothing else in this class assigns the two configuration fields, so
        // without this step FSDirectory.open() would receive null.
        if (args == null || args.length < 2) {
            throw new IllegalArgumentException(
                    "Usage: IndexSourceCorpus_v2 <indexDirectory> <inputListFile>");
        }
        indexDir = new File(args[0]);
        inputLstFile = args[1];

        long startMillis = System.currentTimeMillis();

        // Decide between "create" and "append" BEFORE the directory is opened,
        // so the decision cannot be influenced by anything opening it does.
        boolean indexAlreadyExists = indexDir.exists();

        Directory targetIndexDir = FSDirectory.open(indexDir);
        Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_44);
        IndexWriterConfig iwc = new IndexWriterConfig(Version.LUCENE_44, analyzer);

        if (indexAlreadyExists) {
            System.err.println("Adding files to existing index: '" + indexDir);
            // Add new documents to an existing index:
            iwc.setOpenMode(OpenMode.CREATE_OR_APPEND);
        } else {
            // Create a new index in the directory:
            iwc.setOpenMode(OpenMode.CREATE);
        }

        //set ram buffer size (optional)
        iwc.setRAMBufferSizeMB(256.0);

        IndexWriter writer = new IndexWriter(targetIndexDir, iwc);
        System.err.println("Indexing to directory '" + indexDir + "'...");

        boolean completed = false;
        try {
            // NOTE(review): the list file and the corpus files are decoded with
            // the platform default charset - confirm this matches the corpus.
            BufferedReader reader = new BufferedReader(new FileReader(inputLstFile));
            try {
                int docCount = 0;
                String currentDocPath;
                while ((currentDocPath = reader.readLine()) != null) {
                    currentDocPath = currentDocPath.trim();
                    if (currentDocPath.length() == 0) {
                        continue; // tolerate blank lines in the list file
                    }
                    System.err.println("Processing file: " + currentDocPath);
                    docCount = indexCorpusFile(writer, currentDocPath, docCount);
                }
            } finally {
                reader.close();
            }
            completed = true;
        } finally {
            if (completed) {
                writer.close();
            } else {
                try {
                    writer.rollback();
                } catch (IOException rollbackFailure) {
                    // Report it, but let the exception that aborted the run propagate.
                    System.err.println("Could not roll back the index writer: " + rollbackFailure);
                }
            }
        }

        long elapsedMillis = System.currentTimeMillis() - startMillis;
        System.err.println(elapsedMillis + " total milliseconds");
    }

    /**
     * Reads one corpus file and indexes every document found in it.
     *
     * @param writer   index writer receiving the documents
     * @param path     path of the corpus file; a ".gz" suffix selects gzip decoding
     * @param docCount number of documents indexed before this file
     * @return the number of documents indexed once this file has been processed
     * @throws IOException if the file cannot be read or the index cannot be written
     * @throws Exception   propagated from {@link #indexDocument}
     */
    private static int indexCorpusFile(IndexWriter writer, String path, int docCount)
            throws IOException, Exception {
        InputStream in = new BufferedInputStream(new FileInputStream(path));
        try {
            if (path.endsWith(".gz")) {
                //case when the file to index is a gzip file
                in = new GZIPInputStream(in);
            }
            BufferedReader br = new BufferedReader(new InputStreamReader(in));

            // Document flattened onto one line (lines joined by a space).
            StringBuilder pageBuffer = new StringBuilder();
            // Same document with every line break kept as LINE_BREAK_MARKER.
            StringBuilder rawPageBuffer = new StringBuilder();

            String line;
            while ((line = br.readLine()) != null) {
                pageBuffer.append(line).append(" ");
                rawPageBuffer.append(line).append(LINE_BREAK_MARKER);

                int tailStart = endOfLastCloseTag(line);
                if (tailStart < 0) {
                    continue; // still inside the current document
                }

                String fileContent = pageBuffer.toString().replace("  ", " ");
                String rawText = rawPageBuffer.toString();

                // Start the next document. Only what follows the closing tag on
                // this line may belong to it; carrying the whole line over would
                // leak the finished document into the next one whenever a
                // document sits on a single line.
                pageBuffer.setLength(0);
                rawPageBuffer.setLength(0);
                String tail = line.substring(tailStart);
                if (tail.trim().length() > 0) {
                    pageBuffer.append(tail).append(" ");
                    rawPageBuffer.append(tail).append(LINE_BREAK_MARKER);
                }

                if (indexOfIgnoreCase(fileContent, DOC_OPEN_TAG, 0) < 0) {
                    System.err.println("Skipping a fragment without an opening document tag in: " + path);
                    continue;
                }

                String docId = extractDocId(fileContent);
                if (docId.length() == 0) {
                    System.err.println("No document id found for a document in: " + path);
                }
                //get the title of the page
                String docTitle = extractTitle(fileContent);
                //get the content of the page
                String content = extractContent(fileContent);
                String rawContent = extractRawContent(rawText);

                indexDocument(writer, docId, content, docTitle, rawContent);
                docCount++;
                System.err.println("Processed " + docCount + " documents");
            }

            if (pageBuffer.toString().trim().length() > 0) {
                System.err.println("Ignoring trailing text after the last closing document tag in: " + path);
            }
        } finally {
            // Closes the gzip stream (when present) and the underlying file stream.
            in.close();
        }
        return docCount;
    }

    /**
     * Adds a single document to the index.
     *
     * @param index_writer    writer receiving the document
     * @param doc_id          document id; '-' is replaced by '_' and the id is lower-cased
     * @param doc_content     document text flattened onto one line (field "cnt")
     * @param doc_title       document title (field "title")
     * @param originalContent raw document text in which line breaks are encoded
     *                        with the internal marker (field "text")
     * @throws IOException if the index cannot be written
     * @throws Exception   declared for compatibility with existing callers
     */
    private static void indexDocument(IndexWriter index_writer, String doc_id, String doc_content, String doc_title,
            String originalContent) throws IOException, Exception {
        Document doc = new Document();

        doc_id = doc_id.replaceAll("-", "_").toLowerCase(); // normalize

        //add proper "\n" in the original document
        originalContent = originalContent.replace(LINE_BREAK_MARKER, "\n");

        doc.add(new StringField("id", doc_id, Field.Store.YES));
        // NOTE(review): StringField indexes the whole value as one single token;
        // a store-only field may suit full document text better - confirm with
        // the readers of this index before changing the schema.
        doc.add(new StringField("text", originalContent, Field.Store.YES));
        doc.add(new TextField("title", doc_title, Field.Store.YES));
        doc.add(new TextField("cnt", doc_content, Field.Store.YES));

        index_writer.addDocument(doc);
    }

    /**
     * Extracts the document id from the text of one document.
     * <p>
     * The three id patterns are tried in turn and the last one that matches
     * wins. When none yields an id, the text between the first {@code id=}
     * attribute and the next {@code ">} is used.
     *
     * @param xmlContent text of one document, flattened onto a single line
     * @return the trimmed id, or an empty string when no id can be located
     */
    static String extractDocId(String xmlContent) {
        if (xmlContent == null) {
            return "";
        }
        String doc_id = "";

        // The patterns are greedy and the document is a single line, so a match
        // can run far past the id; cut it at the first character sequence that
        // cannot be part of the id.
        Matcher form1Matcher = docId_1.matcher(xmlContent);
        if (form1Matcher.find()) {
            doc_id = cutAt(form1Matcher.group(1), "\"");
        }

        Matcher form2Matcher = docId_2.matcher(xmlContent);
        if (form2Matcher.find()) {
            doc_id = cutAt(form2Matcher.group(1), "\"");
        }

        Matcher form3Matcher = docId_3.matcher(xmlContent);
        if (form3Matcher.find()) {
            doc_id = cutAt(form3Matcher.group(1), "</DOCID>");
        }

        if (doc_id.trim().length() == 0) {
            int attribute = indexOfIgnoreCase(xmlContent, "id=", 0);
            // Skip the three characters of id= plus the opening quote.
            int valueStart = attribute + 4;
            int valueEnd = (attribute < 0) ? -1 : xmlContent.indexOf("\">", valueStart);
            doc_id = (valueEnd < 0) ? "" : xmlContent.substring(valueStart, valueEnd);
        }
        return doc_id.trim();
    }

    /**
     * Extracts the headline of a document.
     *
     * @param content text of one document
     * @return the trimmed text between the opening and closing headline tags
     *         (matched case-insensitively), or an empty string when the
     *         document has no complete headline element
     */
    static String extractTitle(String content) {
        if (content == null) {
            return "";
        }
        String headerTag = "<headline>";
        int tagIndex = indexOfIgnoreCase(content, headerTag, 0);
        if (tagIndex < 0) {
            return "";
        }
        int startIndex = tagIndex + headerTag.length();
        int endIndex = indexOfIgnoreCase(content, "</headline>", startIndex);
        if (endIndex < 0) {
            return "";
        }
        return content.substring(startIndex, endIndex).trim();
    }

    /**
     * Returns the flattened document text starting at its opening document tag.
     *
     * @param content text of one document, possibly preceded by other text
     * @return the trimmed text from the first opening document tag onwards, or
     *         an empty string when there is no such tag
     */
    static String extractContent(String content) {
        return fromDocOpenTag(content);
    }

    /**
     * Returns the raw document text starting at its opening document tag.
     *
     * @param content raw text of one document, possibly preceded by other text
     * @return the trimmed text from the first opening document tag onwards, or
     *         an empty string when there is no such tag
     */
    static String extractRawContent(String content) {
        return fromDocOpenTag(content);
    }

    /** Returns the trimmed text from the first opening document tag, or "" when it is absent. */
    private static String fromDocOpenTag(String content) {
        if (content == null) {
            return "";
        }
        int startIndex = indexOfIgnoreCase(content, DOC_OPEN_TAG, 0);
        return (startIndex < 0) ? "" : content.substring(startIndex).trim();
    }

    /** Returns value up to (excluding) the first occurrence of terminator, or value itself when absent. */
    private static String cutAt(String value, String terminator) {
        int end = value.indexOf(terminator);
        return (end < 0) ? value : value.substring(0, end);
    }

    /** Returns the index just past the last closing document tag in line, or -1 when the line has none. */
    private static int endOfLastCloseTag(String line) {
        int end = -1;
        int found = indexOfIgnoreCase(line, DOC_CLOSE_TAG, 0);
        while (found >= 0) {
            end = found + DOC_CLOSE_TAG.length();
            found = indexOfIgnoreCase(line, DOC_CLOSE_TAG, end);
        }
        return end;
    }

    /**
     * Case-insensitive indexOf that works on the original string, so the
     * returned index is always valid for it and does not depend on the default
     * locale (lower-casing a copy can shift positions).
     */
    private static int indexOfIgnoreCase(String text, String needle, int fromIndex) {
        int lastStart = text.length() - needle.length();
        for (int i = Math.max(fromIndex, 0); i <= lastStart; i++) {
            if (text.regionMatches(true, i, needle, 0, needle.length())) {
                return i;
            }
        }
        return -1;
    }
}