de.citec.lucene.CreateIndex.java Source code

Java tutorial

Introduction

Here is the source code for de.citec.lucene.CreateIndex.java

Source

/*
 * To change this license header, choose License Headers in Project Properties.
 * To change this template file, choose Tools | Templates
 * and open the template in the editor.
 */
package de.citec.lucene;

import de.citec.util.Language;
import java.io.IOException;
import java.io.UncheckedIOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.List;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.stream.Stream;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.de.GermanAnalyzer;
import org.apache.lucene.analysis.en.EnglishAnalyzer;
import org.apache.lucene.analysis.es.SpanishAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;

/**
 *
 * @author swalter
 */
public class CreateIndex {

    private static int counter = 0;

    public static void main(String[] args) throws IOException {
        Analyzer analyzer = null;

        List<String> files = new ArrayList<>();
        files.add("/Users/swalter/Documents/EsaDeutsch/new_copus_german.txt");
        String indexPath = "/Users/swalter/Documents/EsaDeutsch/Index/";
        Language language = Language.DE;
        Directory dir = FSDirectory.open(Paths.get(indexPath));

        if (language.equals(Language.DE))
            analyzer = new GermanAnalyzer();
        if (language.equals(Language.ES))
            analyzer = new SpanishAnalyzer();
        if (language.equals(Language.EN))
            analyzer = new EnglishAnalyzer();

        IndexWriterConfig iwc = new IndexWriterConfig(analyzer);
        iwc.setOpenMode(OpenMode.CREATE);
        iwc.setRAMBufferSizeMB(12000);
        try (IndexWriter writer = new IndexWriter(dir, iwc)) {
            files.forEach(f -> {
                try {
                    indexDocs(writer, Paths.get(f));
                } catch (IOException ex) {
                    Logger.getLogger(CreateIndex.class.getName()).log(Level.SEVERE, null, ex);
                }
            });

        }
        System.out.println(counter);

    }

    static void indexDocs(final IndexWriter writer, Path path) throws IOException {
        Stream<String> lines = Files.lines(path);
        lines.forEach(s -> indexDoc(writer, s));
    }

    //            name = name.replace("_piat", "");
    //            name = name.replace("_adv", "");
    //            name = name.replace(("_pper"), "");
    //            name = name.replace(("_appr"), "");
    //            name = name.replace("_card", "");
    //            name = name.replace("_xy ", "");
    //            name = name.replace("_card .", "");
    //            name = name.replace("_NN","");
    //            name = name.replace("_ADJA","");
    //            name = name.replace("_ADJD","");
    //            name = name.replace("_NE","");
    //            name = name.replace("_appr", "");

    private static void indexDoc(final IndexWriter writer, String input) {

        try {
            input = input.replace("\n", "");
            String[] tmp = input.split("######");
            String persons = tmp[0];
            String id = tmp[1];
            String name = tmp[2];
            name = name.replace("-RRB-_TRUNC", "");
            name = name.replace("-LRB-_TRUNC", "");
            name = name.replace("._$.", "");
            name = name.replace(" ._$.", "");
            name = name.replace("_$.", "");
            name = name.replace("/_$[", "");
            name = name.replace("-_$[", "");
            name = name.replace("_$[", "");
            name = name.replace(" 's", "s");
            name = name.replace("' ", "");
            name = name.replace("'", " ");
            name = name.replace("  ", " ");
            if (name.contains(" ")) {
                String[] tmp_string = name.split(" ");
                name = "";
                for (String s : tmp_string) {
                    if (s.contains("_")) {
                        name += " " + s.split("_")[0];
                    } else
                        name += " " + s;
                }
            }
            name = name.trim();
            if (tmp.length < 4) {
                //System.out.println(input);
                counter += 1;
            } else {
                //System.out.println(name);
                String text = tmp[3];
                Document doc = new Document();
                Field field_persons = new StringField("persons", persons, Field.Store.NO);
                Field field_id = new StringField("id", id, Field.Store.YES);
                Field field_name = new TextField("name", name, Field.Store.YES);
                Field field_text = new TextField("text", text.toLowerCase(), Field.Store.NO);
                doc.add(field_persons);
                doc.add(field_id);
                doc.add(field_name);
                doc.add(field_text);
                writer.addDocument(doc);
            }

        } catch (Exception e) {
            e.printStackTrace();
            //System.out.println("Problem with:"+input);
        }

    }

}