Java tutorial: indexing AMI annotations with Apache Lucene

The AMIIndexer class below reads a tab-separated AMI annotation file and writes each sentence (or dialogue segment, depending on configuration) as a separate document into a Lucene index. Every document stores the meeting file name, a sentence/segment id, the speaker, the text itself, and the decision/preference labels taken from the annotation columns.
package indexer;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.util.ArrayList;
import java.util.List;
import java.util.Properties;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.core.StopFilter;
import org.apache.lucene.analysis.en.EnglishAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.FSDirectory;

/**
 * Writes the csv-formatted annotations into a Lucene index.
 * Each sentence is stored as a separate document in the index.
 * Each document carries a field holding the name of the meeting file it came from.
 * @author Debasis
 */
public class AMIIndexer {
    Properties prop;
    File indexDir;
    Analyzer analyzer;
    IndexWriter writer;
    List<String> stopwords;

    public static final String SENTENCE_DELIMS = ".?!";
    public static final String DELIMS = ",;.!'\".?$&*(){}[]<>/\\|";
    public static final String WORD_LABEL_DELIM = "_";

    public AMIIndexer(String propFile) throws Exception {
        prop = new Properties();
        prop.load(new FileReader(propFile));
        analyzer = constructAnalyzer(prop.getProperty("stopfile"));
        String indexPath = prop.getProperty("index");
        indexDir = new File(indexPath);
    }

    // Loads one stopword per line from the given file.
    public static List<String> buildStopwordList(String stopwordFileName) {
        List<String> stopwords = new ArrayList<>();
        String line;
        try (FileReader fr = new FileReader(stopwordFileName);
             BufferedReader br = new BufferedReader(fr)) {
            while ((line = br.readLine()) != null) {
                stopwords.add(line.trim());
            }
        } catch (Exception ex) {
            ex.printStackTrace();
        }
        return stopwords;
    }

    // Default analyzer: English stemming with the custom stopword list.
    public static Analyzer constructAnalyzer(String stopwordFileName) {
        return new EnglishAnalyzer(
                StopFilter.makeStopSet(buildStopwordList(stopwordFileName)));
    }

    public void process() throws Exception {
        System.out.println("Indexing AMI annotations...");
        IndexWriterConfig iwcfg = new IndexWriterConfig(analyzer);
        iwcfg.setOpenMode(IndexWriterConfig.OpenMode.CREATE);
        writer = new IndexWriter(FSDirectory.open(indexDir.toPath()), iwcfg);
        indexAnnotation();
        writer.close();
    }

    // A word ends a sentence if its first character is a sentence delimiter.
    boolean isEOS(String word) {
        for (char eos : AMIIndexer.SENTENCE_DELIMS.toCharArray()) {
            if (word.charAt(0) == eos)
                return true;
        }
        return false;
    }

    boolean isPunct(String word) {
        char ch = word.charAt(0);
        return DELIMS.indexOf(ch) >= 0;
    }

    // Maps an annotation label to a numeric score:
    // yes/maybe -> 1, no/nan -> 0, anything else is parsed as a number.
    int getLabel(String label) {
        if (label.equals("nan"))
            return 0;
        else if (label.equalsIgnoreCase("yes") || label.equalsIgnoreCase("maybe"))
            return 1;
        else if (label.equalsIgnoreCase("no"))
            return 0;
        return (int) Float.parseFloat(label);
    }

    // True if thisSegmentId immediately follows prevSegmentId within the same file.
    boolean isConsecutiveSegment(String prevSegmentId, String thisSegmentId,
                                 String prevFileName, String thisFileName) {
        if (!prevFileName.equals(thisFileName))
            return false;
        int prevSegment = Integer.parseInt(prevSegmentId);
        int thisSegment = Integer.parseInt(thisSegmentId);
        return thisSegment == prevSegment + 1;
    }

    // Reads the tab-separated annotation file one word per line and accumulates
    // words into a buffer until a sentence (or segment) boundary is reached,
    // at which point the buffer is written out as one Lucene document.
    void indexAnnotation() throws Exception {
        boolean indexSentence = prop.getProperty("doc.unit").equals("sentence"); // sentence/segment
        String annotationFile = prop.getProperty("annotation.csvfile");
        FileReader fr = new FileReader(annotationFile);
        BufferedReader br = new BufferedReader(fr);
        String line, prevFileName = null, prevSegmentId = null;
        StringBuffer buff = new StringBuffer();
        int dec = 0, pref = 0;
        int labelCol = Integer.parseInt(prop.getProperty("decs.column"));
        boolean writeLabelsForEachWord = Boolean.parseBoolean(prop.getProperty("word.labels", "false"));

        while ((line = br.readLine()) != null) {
            String[] tokens = line.split("\t");
            String word = tokens[0];
            String fileName = tokens[2];
            String segmentId = tokens[3];

            // Propagate the maximum label seen so far in this sentence/segment.
            int decLabel = getLabel(tokens[labelCol]);
            if (decLabel > dec)
                dec = decLabel;
            int prefLabel = getLabel(tokens[20]);
            if (prefLabel > pref)
                pref = prefLabel;

            String wordWithPayload = word;
            word = word.replace(WORD_LABEL_DELIM, "");
            if (writeLabelsForEachWord) {
                wordWithPayload = word + WORD_LABEL_DELIM + decLabel + WORD_LABEL_DELIM + prefLabel + " ";
            }

            if (indexSentence) {
                // Sentence mode: flush the buffer whenever an end-of-sentence token is seen.
                if (isEOS(word)) {
                    buff.append(wordWithPayload);
                    Document d = constructDoc(buff.toString(), fileName, segmentId, tokens[9], dec, pref);
                    writer.addDocument(d);
                    buff = new StringBuffer();
                    dec = 0;
                    pref = 0;
                    continue;
                }
            } else if (prevFileName != null && prevSegmentId != null) {
                // Segment mode: flush the buffered text as one document when the next
                // (consecutive) segment of the same file begins.
                if (isConsecutiveSegment(prevSegmentId, segmentId, prevFileName, fileName)) {
                    Document d = constructDoc(buff.toString(), fileName, fileName + "_" + segmentId, tokens[9], dec, pref);
                    writer.addDocument(d);
                    buff = new StringBuffer();
                    dec = 0;
                    pref = 0;
                    prevSegmentId = segmentId;
                    prevFileName = fileName;
                    buff.append(wordWithPayload);
                    continue;
                }
            }

            prevFileName = fileName;
            prevSegmentId = segmentId;

            if (!isPunct(word)) {
                buff.append(" ");
            } else if (indexSentence && buff.length() > 0 && isEOS(word)) {
                buff.deleteCharAt(buff.length() - 1);
            }
            buff.append(wordWithPayload);
        }
        br.close();
    }

    // Builds one Lucene document for a sentence/segment, storing its metadata,
    // the text content, and the propagated decision/preference labels.
    public Document constructDoc(String sentence, String fileName, String sentenceId,
                                 String speaker, int dec, int pref) throws Exception {
        System.out.println("Storing sentence/segment " + sentenceId + " of doc: " + fileName);
        Document doc = new Document();
        // Meta
        doc.add(new Field(AMI_FIELDS.FIELD_DOC_NAME, fileName,
                Field.Store.YES, Field.Index.NOT_ANALYZED));
        // sentence id within the file (or fileName_segmentId in segment mode)
        doc.add(new Field(AMI_FIELDS.FIELD_SENTENCE_ID, sentenceId,
                Field.Store.YES, Field.Index.NOT_ANALYZED));
        // content
        doc.add(new Field(AMI_FIELDS.FIELD_CONTENT, sentence,
                Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.YES));
        // speaker and class labels
        doc.add(new Field(AMI_FIELDS.FIELD_SPEAKER_ID, speaker,
                Field.Store.YES, Field.Index.NOT_ANALYZED));
        doc.add(new Field(AMI_FIELDS.FIELD_DECISION_SCORE, String.valueOf(dec),
                Field.Store.YES, Field.Index.NOT_ANALYZED));
        doc.add(new Field(AMI_FIELDS.FIELD_PREF_SCORE, String.valueOf(pref),
                Field.Store.YES, Field.Index.NOT_ANALYZED));
        return doc;
    }

    public static void main(String[] args) {
        if (args.length == 0) {
            System.out.println("Usage: java AMIIndexer <prop-file>");
            args = new String[] { "init.properties" };
        }
        try {
            AMIIndexer indexer = new AMIIndexer(args[0]);
            indexer.process();
        } catch (Exception ex) {
            ex.printStackTrace();
        }
    }
}
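The indexer is driven entirely by a Java properties file; main() falls back to init.properties when no argument is supplied. A minimal sketch of such a file follows, using the property keys read in the code above. All paths, file names, and the column number are placeholder assumptions, not values from the original project.

# init.properties (hypothetical example values)
# Directory in which the Lucene index is created
index=/path/to/ami-index
# Stopword file, one word per line
stopfile=/path/to/stopwords.txt
# Tab-separated annotation file, one word per line
annotation.csvfile=/path/to/ami-annotations.tsv
# "sentence" indexes one sentence per document; any other value indexes per segment
doc.unit=sentence
# Column holding the decision label in the annotation file (placeholder value)
decs.column=19
# If true, every indexed word is written as word_<dec>_<pref>
word.labels=false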
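constructDoc() refers to an AMI_FIELDS class that is not shown in this listing; it evidently just holds the field-name constants. A minimal sketch is given below: the constant names match the code above, but the string values are assumptions and may differ from those in the original project.

package indexer;

// Hypothetical field-name constants used by AMIIndexer; the actual string
// values in the original project may differ.
public class AMI_FIELDS {
    public static final String FIELD_DOC_NAME       = "docname";
    public static final String FIELD_SENTENCE_ID    = "sentid";
    public static final String FIELD_CONTENT        = "content";
    public static final String FIELD_SPEAKER_ID     = "speaker";
    public static final String FIELD_DECISION_SCORE = "dec";
    public static final String FIELD_PREF_SCORE     = "pref";
}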
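To verify what the indexer wrote, the stored fields can be read straight back out of the index. The following is a minimal sketch, assuming the same Lucene version as the indexer and the hypothetical AMI_FIELDS constants sketched above; AMIIndexDumper and the default index path are illustrative, not part of the original code.

package indexer;

import java.nio.file.Paths;

import org.apache.lucene.document.Document;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.store.FSDirectory;

// Hypothetical helper that prints every stored sentence/segment with its labels.
public class AMIIndexDumper {
    public static void main(String[] args) throws Exception {
        String indexPath = args.length > 0 ? args[0] : "/path/to/ami-index"; // placeholder path
        try (IndexReader reader = DirectoryReader.open(FSDirectory.open(Paths.get(indexPath)))) {
            for (int i = 0; i < reader.maxDoc(); i++) {
                Document d = reader.document(i);
                System.out.println(
                        d.get(AMI_FIELDS.FIELD_DOC_NAME) + "\t" +
                        d.get(AMI_FIELDS.FIELD_SENTENCE_ID) + "\t" +
                        d.get(AMI_FIELDS.FIELD_SPEAKER_ID) + "\t" +
                        d.get(AMI_FIELDS.FIELD_DECISION_SCORE) + "\t" +
                        d.get(AMI_FIELDS.FIELD_PREF_SCORE) + "\t" +
                        d.get(AMI_FIELDS.FIELD_CONTENT));
            }
        }
    }
}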