di.uniba.it.tee2.index.TemporalEventIndexing.java Source code

Introduction

Here is the source code for di.uniba.it.tee2.index.TemporalEventIndexing.java
Source

/**
 * Copyright (c) 2014, the TEE2 AUTHORS.
 *
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * Redistributions of source code must retain the above copyright notice, this
 * list of conditions and the following disclaimer.
 *
 * Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
 * Neither the name of the University of Bari nor the names of its contributors
 * may be used to endorse or promote products derived from this software without
 * specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 *
 * GNU GENERAL PUBLIC LICENSE - Version 3, 29 June 2007
 *
 */
package di.uniba.it.tee2.index;

import di.uniba.it.tee2.analyzer.EnglishNoStemAnalyzer;
import di.uniba.it.tee2.analyzer.ItalianNoStemAnalyzer;
import di.uniba.it.tee2.data.TaggedText;
import di.uniba.it.tee2.data.TimeEvent;
import di.uniba.it.tee2.extraction.TemporalExtractor;
import java.io.Closeable;
import java.io.File;
import java.io.IOException;
import java.util.logging.Level;
import java.util.logging.Logger;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.IntField;
import org.apache.lucene.document.StoredField;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;

public class TemporalEventIndexing {

    private Analyzer analyzer;
    private TemporalExtractor tempExtractor;
    private FSDirectory time_index;
    private FSDirectory doc_index;
    private FSDirectory docrep_index;
    private IndexWriter time_writer;
    private IndexWriter doc_writer;
    private IndexWriter docrep_writer;
    private int contextSize = 256;
    private static final Logger logger = Logger.getLogger(TemporalEventIndexing.class.getName());

    /**
     * @param lang
     * @param mainDir
     * @throws IOException
     *
     */
    public void init(String lang, String mainDir) throws IOException {
        tempExtractor = new TemporalExtractor(lang);
        tempExtractor.init();
        time_index = FSDirectory.open(new File(mainDir + "/time"));
        doc_index = FSDirectory.open(new File(mainDir + "/doc"));
        docrep_index = FSDirectory.open(new File(mainDir + "/repo"));
        switch (lang) {
        case "italian":
            analyzer = new ItalianNoStemAnalyzer(Version.LUCENE_48);
            break;
        case "english":
            analyzer = new EnglishNoStemAnalyzer(Version.LUCENE_48);
            break;
        default:
            analyzer = new StandardAnalyzer(Version.LUCENE_48);
            break;
        }
        IndexWriterConfig configTime = new IndexWriterConfig(Version.LUCENE_48, analyzer);
        configTime.setOpenMode(IndexWriterConfig.OpenMode.CREATE);
        time_writer = new IndexWriter(time_index, configTime);
        IndexWriterConfig configDoc = new IndexWriterConfig(Version.LUCENE_48, analyzer);
        configDoc.setOpenMode(IndexWriterConfig.OpenMode.CREATE);
        doc_writer = new IndexWriter(doc_index, configDoc);
        IndexWriterConfig configDocRep = new IndexWriterConfig(Version.LUCENE_48, analyzer);
        configDoc.setOpenMode(IndexWriterConfig.OpenMode.CREATE);
        docrep_writer = new IndexWriter(docrep_index, configDocRep);

    }

    /**
     * @throws IOException
     *
     */
    public void close() throws IOException {
        //close writers
        time_writer.close();
        doc_writer.close();
        docrep_writer.close();
    }

    public int getContextSize() {
        return contextSize;
    }

    public void setContextSize(int contextSize) {
        this.contextSize = contextSize;
    }

    /**
     * Crea e memorizza un documento xml a partire dalla stringa fornita in
     * input dopo averla taggata usando HeidelTime.
     *
     * @param title
     * @param content
     * @param fileName
     * @param docID
     * @param wikiID
     * @param revisionID
     * @throws java.lang.Exception
     */
    public void add(String title, String content, String fileName, String docID, int wikiID, int revisionID)
            throws Exception {
        TaggedText tt = null;
        try {
            tt = tempExtractor.process(content);
        } catch (Exception ex) {
            logger.log(Level.WARNING, "Error to process doc " + docID + " (skip doc)", ex);
        }
        if (tt != null) {

            //stores id and text (not tagged) in docrep_index (document repository)
            Document docrep_doc = new Document();
            docrep_doc.add(new StringField("id", docID, Field.Store.YES));
            docrep_doc.add(new IntField("wikiID", wikiID, Field.Store.YES));
            docrep_doc.add(new IntField("revisionID", revisionID, Field.Store.YES));
            docrep_doc.add(new StringField("title", title, Field.Store.YES));
            docrep_doc.add(new StoredField("content", tt.getText()));
            docrep_doc.add(new StringField("filename", fileName, Field.Store.YES));
            docrep_writer.addDocument(docrep_doc);

            //stores id and text (not tagged) in doc_index for search
            Document doc_doc = new Document();
            doc_doc.add(new StringField("id", docID, Field.Store.YES));
            doc_doc.add(new IntField("wikiID", wikiID, Field.Store.YES));
            doc_doc.add(new IntField("revisionID", revisionID, Field.Store.YES));
            doc_doc.add(new TextField("title", title, Field.Store.NO));
            doc_doc.add(new TextField("content", tt.getText(), Field.Store.NO));
            doc_writer.addDocument(doc_doc);

            logger.log(Level.FINE, "Found {0} temporal events", tt.getEvents().size());

            for (TimeEvent event : tt.getEvents()) { //for each TIMEX3 store info time index
                //stores id, file name and text (TimeML tagged) in time_index
                Document time_doc = new Document();
                time_doc.add(new StringField("id", docID, Field.Store.YES));
                //time_doc.add(new StringField("file", fileName, Field.Store.YES));
                //time_doc.add(new TextField("content", tt.getTaggedText(), Field.Store.NO));
                /*FieldType ft = new FieldType();
                 ft.setStoreTermVectors(true);
                 ft.setTokenized(true);
                 ft.setStored(true);
                 ft.setIndexed(true);
                 ft.setStoreTermVectorPositions(true);
                 ft.setOmitNorms(false);*/

                time_doc.add(new StringField("time", event.getDateString(), Field.Store.YES));
                time_doc.add(new IntField("offset_start", event.getStartOffset(), Field.Store.YES));
                time_doc.add(new IntField("offset_end", event.getEndOffset(), Field.Store.YES));
                time_doc.add(new TextField("context",
                        getTimeContext(tt.getText(), event.getStartOffset(), event.getEndOffset()),
                        Field.Store.NO));
                time_writer.addDocument(time_doc);
            }
        }
    }

    public String getTimeContext(String content, int startOffset, int endOffset) {
        int start = Math.max(0, startOffset - contextSize);
        int end = Math.min(content.length(), endOffset + contextSize);
        while (start > 0 && !Character.isWhitespace(content.charAt(start))) {
            start--;
        }
        while (end < content.length() && !Character.isWhitespace(content.charAt(end))) {
            end++;
        }
        return content.substring(start, end);
    }

    public static String getTimeContext(String content, int startOffset, int endOffset, int contextSize) {
        int start = Math.max(0, startOffset - contextSize);
        int end = Math.min(content.length(), endOffset + contextSize);
        while (start > 0 && !Character.isWhitespace(content.charAt(start))) {
            start--;
        }
        while (end < content.length() && !Character.isWhitespace(content.charAt(end))) {
            end++;
        }
        return content.substring(start, end);
    }

}