edu.udel.ece.infolab.btc.Indexing.java Source code


Introduction

Here is the source code for edu.udel.ece.infolab.btc.Indexing.java

Source

/**
 * Copyright 2011, Campinas Stephane
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */
/**
 * @project trec-entity-tool
 * @author Campinas Stephane [ 3 Jun 2011 ]
 * @link stephane.campinas@deri.org
 */
package edu.udel.ece.infolab.btc;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FilenameFilter;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.Arrays;
import java.util.Enumeration;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Hashtable;
import java.util.Iterator;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.zip.GZIPInputStream;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.PerFieldAnalyzerWrapper;
import org.apache.lucene.analysis.WhitespaceAnalyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.Field.Index;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.LogMergePolicy;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.Version;
import org.sindice.siren.analysis.TupleAnalyzer;
import org.sindice.siren.analysis.TupleAnalyzer.URINormalisation;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Index a list of entities, creating incoming-triple, outgoing-triple, subject
 * and type fields. The type field groups the rdf:type objects of the
 * entity.<br>
 * Outgoing triples are stored as n-tuples in which a predicate carries all of
 * its related values.
 * Incoming triples are also stored as n-tuples, the difference being that a
 * predicate carries its related subject URIs instead.
 */
public abstract class Indexing {

    protected final Logger logger = LoggerFactory.getLogger(Indexing.class);

    /* Perform a commit after every batch of COMMIT documents */
    public static int COMMIT = 10000;
    public static boolean STORE = true;
    public static int SKIP_TO = 0;
    public static int PROCESS_BATCH = 100000;

    // FIELDS
    final static public String INCOMING_TRIPLE = "incoming-triple";
    final static public String OUTGOING_TRIPLE = "outgoing-triple";
    final static public String SUBJECT = "subject";
    final static public String TYPE = "type";

    /* The data set files */
    protected final File[] input;
    protected int inputPos = 0;
    /* The current reader into the compressed archive */
    protected GZIPInputStream reader = null;
    protected BufferedReader _rin = null;

    /* the triple hash for the current archive file */
    protected Hashtable<String, HashSet<String>> _outTripleHash = null;
    protected Hashtable<String, HashSet<String>> _inTripleHash = null;
    protected StringBuilder sb = new StringBuilder();
    /* the current entity */
    private final Entity _entity = new Entity();

    Pattern _clearPattern = Pattern.compile("[<>]");

    /* SIREn index */
    protected final Directory indexDir;
    protected final IndexWriter writer;

    /**
     * Create a SIREn index in the directory dir, taking the files in inputDir as input.
     * @param inputDir the directory containing the dataset dump files
     * @param dir the directory in which the index is created
     * @throws IOException
     */
    public Indexing(final File inputDir, final Directory dir) throws IOException {
        this.input = inputDir.listFiles(new FilenameFilter() {

            //@Override
            public boolean accept(File dir, String name) {
                return true;
            }
        });

        /*
         *  Sort by filename: important because in the SIndice-ED dataset, two
         *  consecutive dumps can store the same entity
         */
        Arrays.sort(this.input);
        if (this.input.length == 0) {
            throw new RuntimeException("No archive files in the folder: " + inputDir.getAbsolutePath());
        }

        this.indexDir = dir;
        this.writer = initializeIndexWriter(this.indexDir);
        reader = getInputStream(this.input[0]);
        _rin = new BufferedReader(new InputStreamReader(reader));
        logger.info("Creating index from input located at {} ({} files)", inputDir.getAbsolutePath(), input.length);
        logger.info("Reading dump: {}", this.input[0]);

        _inTripleHash = new Hashtable<String, HashSet<String>>();
        _outTripleHash = new Hashtable<String, HashSet<String>>();
    }

    /**
     * The regular expression that the input file names must match.
     * @return the filename pattern of the input files
     */
    protected abstract String getPattern();

    /**
     * Create a GZIP input stream from the file in.
     * @param in the compressed dump file
     * @return a GZIPInputStream over the file content
     * @throws FileNotFoundException
     * @throws IOException
     */
    private GZIPInputStream getInputStream(final File in) throws FileNotFoundException, IOException {
        return new GZIPInputStream(new FileInputStream(in));
    }

    /**
     * Move to the next archive file of the input folder.
     * @return true if a next archive file can be read, false once all the input files have been processed
     */
    public boolean moveToNextArchive() {
        try {
            if (++inputPos >= input.length) {
                reader.close();
                return false;
            }
            // Next archive file
            reader.close();
            reader = getInputStream(input[inputPos]);
            _rin = new BufferedReader(new InputStreamReader(reader));
            logger.info("Reading dump: {}", this.input[inputPos]);
        } catch (IOException e) {
            logger.error("Error while reading the input: {}\n{}", input[inputPos], e);
        }

        _inTripleHash.clear();
        _outTripleHash.clear();

        return true;
    }

    /**
     * Create an index writer that uses a #TupleAnalyzer on the triple fields,
     * tokenizing the localnames of URIs, and the default #WhitespaceAnalyzer
     * on the other fields.
     * @param dir the index directory
     * @return a configured IndexWriter
     * @throws IOException
     */
    @SuppressWarnings("deprecation")
    private IndexWriter initializeIndexWriter(final Directory dir) throws IOException {
        final Analyzer defaultAnalyzer = new WhitespaceAnalyzer(Version.LUCENE_31);
        final Map<String, Analyzer> fieldAnalyzers = new HashMap<String, Analyzer>();
        final TupleAnalyzer tuple = new TupleAnalyzer(new StandardAnalyzer(Version.LUCENE_31));
        tuple.setURINormalisation(URINormalisation.LOCALNAME);
        fieldAnalyzers.put(OUTGOING_TRIPLE, tuple);
        fieldAnalyzers.put(INCOMING_TRIPLE, tuple);

        final IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_31,
                new PerFieldAnalyzerWrapper(defaultAnalyzer, fieldAnalyzers));

        // Disable compound file
        ((LogMergePolicy) config.getMergePolicy()).setUseCompoundFile(false);
        // Increase merge factor to 20 - more adapted to batch creation
        ((LogMergePolicy) config.getMergePolicy()).setMergeFactor(20);

        config.setRAMBufferSizeMB(256);
        config.setMaxBufferedDocs(IndexWriterConfig.DISABLE_AUTO_FLUSH);
        config.setMaxBufferedDeleteTerms(IndexWriterConfig.DISABLE_AUTO_FLUSH);
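        // auto-flushing by document and delete-term counts is disabled above, so
        // segment flushes are driven solely by the 256 MB RAM buffer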

        final IndexWriter writer = new IndexWriter(dir, config);
        writer.setMaxFieldLength(Integer.MAX_VALUE);
        return writer;
    }

    /* Concatenate the triples of the given set into a single string */
    private String HashSetToString(final HashSet<String> set) {
        sb.setLength(0);
        for (String s : set) {
            sb.append(s);
        }
        return sb.toString();
    }

    /**
     * Creates an entity index
     * @throws CorruptIndexException
     * @throws IOException
     */
    public void indexIt() throws CorruptIndexException, IOException {
        long counter = 0;

        do {
            while (true == parseOneBatch()) {
                counter = processOneBatch(counter);
            }
            counter = processOneBatch(counter);

        } while (moveToNextArchive());

        commit(false, counter, _entity.subject); // Commit what is left
        writer.optimize();
    }

    /* Flatten and index every entity accumulated in the current batch; returns the updated counter */
    private long processOneBatch(long counter) throws CorruptIndexException, IOException {
        Enumeration<String> em = _outTripleHash.keys();
        while (em.hasMoreElements()) {
            _entity.clear();
            String subject = em.nextElement();
            String outTriples = HashSetToString(_outTripleHash.get(subject));
            Utils.sortAndFlattenNTriples(outTriples, _entity.outTuples, _entity.type, true);
            if (_inTripleHash.containsKey(subject)) {
                String inTriples = HashSetToString(_inTripleHash.get(subject));
                Utils.sortAndFlattenNTriples(inTriples, _entity.inTuples, _entity.type, false);
            }

            Document doc = new Document();
            doc.add(new Field(SUBJECT, trim(subject), Store.YES, Index.NOT_ANALYZED_NO_NORMS));
            doc.add(new Field(TYPE, Utils.toString(_entity.type), Store.YES, Index.ANALYZED_NO_NORMS));
            doc.add(new Field(OUTGOING_TRIPLE, _entity.getTriples(true), Store.YES, Index.ANALYZED_NO_NORMS));
            doc.add(new Field(INCOMING_TRIPLE, _entity.getTriples(false), Store.YES, Index.ANALYZED_NO_NORMS));
            writer.addDocument(doc);
            counter = commit(true, counter, subject);
        }
        return counter;
    }

    /**
     * Commits the documents by batches of COMMIT entities.
     * @param indexing true when called after adding a document, false for the final commit
     * @param counter the number of entities indexed so far
     * @param subject the subject of the last indexed entity (currently unused)
     * @return the updated counter
     * @throws CorruptIndexException
     * @throws IOException
     */
    private long commit(boolean indexing, long counter, String subject) throws CorruptIndexException, IOException {
        if (!indexing || (++counter % COMMIT) == 0) { // Index by batch
            writer.commit();
            logger.info("Commited {} entities.", (indexing ? COMMIT : counter));
        }
        return counter;
    }

    /**
     * Close resources
     * @throws CorruptIndexException
     * @throws IOException
     */
    public void close() throws CorruptIndexException, IOException {
        try {
            writer.close();
        } finally {
            indexDir.close();
        }
    }

    /* Check whether the string, once stripped of '<' and '>', is a well-formed URL */
    private boolean isURI(String uri) {
        try {
            Matcher m = _clearPattern.matcher(uri);
            uri = m.replaceAll("");
            new URL(uri);
            return true;
        } catch (MalformedURLException e) {
            return false;
        }
    }

    /*
     * Remove the '<' and '>' characters surrounding a URI
     */
    private String trim(String str) {
        Matcher m = _clearPattern.matcher(str);
        str = m.replaceAll("");
        return str;
    }

    /*
     * Parse one batch of statements from the current archive file, filling the
     * incoming and outgoing triple hashes
     */
    protected boolean parseOneBatch() {
        StringBuilder tripleSb = new StringBuilder();
        _inTripleHash.clear();
        _outTripleHash.clear();

        int parsed = 0;
        try {
            String line;
            String sub;
            String obj;
            int pos = 1;

            while ((line = _rin.readLine()) != null) {
                String[] fields = line.split("\\s");
                if (fields.length < 3) {
                    System.err.println("Invalid record of at N Triple" + pos);
                    continue;
                }

                sub = fields[0];

                if (false == isURI(sub)) {
                    continue;
                }

                // convert the N-Quad record into an N-Triple: drop the context field and terminate with '.'
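                // e.g. (illustrative) "<s> <p> <o> <ctx> ."  becomes  "<s> <p> <o> .\n"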
                tripleSb.setLength(0);
                for (int i = 0; i < fields.length - 2; ++i) {
                    tripleSb.append(fields[i]).append(' ');
                }
                tripleSb.append(".\n");

                // save outgoing triple
                if (_outTripleHash.containsKey(sub)) {
                    HashSet<String> triple = _outTripleHash.get(sub);
                    triple.add(tripleSb.toString());
                    _outTripleHash.put(sub, triple);
                } else {
                    HashSet<String> triple = new HashSet<String>();
                    triple.add(tripleSb.toString());
                    _outTripleHash.put(sub, triple);
                }

                // if the object is also a URI, the statement is recorded as an incoming triple for that object
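                // note: a plain N-Quad line splits into 5 fields: subject, predicate, object, context and the final '.'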
                if (5 == fields.length) {
                    obj = fields[2];
                    if (true == isURI(obj)) {
                        // save incoming triple
                        if (_inTripleHash.containsKey(obj)) {
                            HashSet<String> triple = _inTripleHash.get(obj);
                            triple.add(tripleSb.toString());
                            _inTripleHash.put(obj, triple);
                        } else {
                            HashSet<String> triple = new HashSet<String>();
                            triple.add(tripleSb.toString());
                            _inTripleHash.put(obj, triple);
                        }
                    }
                }

                ++pos;
                if (++parsed > PROCESS_BATCH) {
                    break;
                }
            }

            if (parsed > PROCESS_BATCH) {
                return true;
            } else {
                _rin.close();
                return false;
            }
        } catch (IOException e) {
            System.err.println("I/O Error" + e);
            e.printStackTrace();
            System.exit(-1);
        } catch (Exception e) {
            System.err.println("Error" + e);
            e.printStackTrace();
            System.exit(-1);
        }
        return false;
    }
}
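
Example usage

Indexing is abstract, so a small driver is needed to run it. The sketch below is illustrative only and is not part of the original project: the subclass name IndexingExample, the ".gz" filename pattern and the command-line handling are assumptions; it presumes Lucene 3.1 and the SIREn analyzer library are on the classpath, matching the imports above.

// Hypothetical driver for the abstract Indexing class above.
package edu.udel.ece.infolab.btc;

import java.io.File;
import java.io.IOException;

import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;

public class IndexingExample extends Indexing {

    public IndexingExample(final File inputDir, final Directory dir) throws IOException {
        super(inputDir, dir);
    }

    @Override
    protected String getPattern() {
        // accept gzipped dump files (illustrative; the filter in Indexing currently accepts every file)
        return ".*\\.gz";
    }

    public static void main(final String[] args) throws Exception {
        final File inputDir = new File(args[0]);                          // folder containing the *.gz dumps
        final Directory indexDir = FSDirectory.open(new File(args[1]));   // where the SIREn index is written
        final Indexing indexing = new IndexingExample(inputDir, indexDir);
        try {
            indexing.indexIt();  // parse the dumps batch by batch and index the entities
        } finally {
            indexing.close();    // closes the IndexWriter and the index Directory
        }
    }
}

Note that indexIt() already commits the remaining documents and optimizes the index, so a driver like this only has to close the writer afterwards.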