edu.unika.aifb.graphindex.index.KeywordIndexBuilder.java Source code

Java tutorial

Introduction

Here is the source code for edu.unika.aifb.graphindex.index.KeywordIndexBuilder.java

Source

package edu.unika.aifb.graphindex.index;

/**
 * Copyright (C) 2009 Lei Zhang (beyondlei at gmail.com)
 * Copyright (C) 2009 Günter Ladwig (gla at aifb.uni-karlsruhe.de)
 * 
 * This file is part of the graphindex project.
 *
 * graphindex is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License, version 2
 * as published by the Free Software Foundation.
 * 
 * graphindex is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 * 
 * You should have received a copy of the GNU General Public License
 * along with graphindex.  If not, see <http://www.gnu.org/licenses/>.
 */

import java.io.BufferedReader;
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.io.ObjectOutputStream;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;

import org.apache.log4j.Logger;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermDocs;
import org.apache.lucene.index.IndexWriter.MaxFieldLength;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.Hit;
import org.apache.lucene.search.Hits;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.store.FSDirectory;
import org.ho.yaml.Yaml;
import org.openrdf.model.vocabulary.RDF;
import org.openrdf.model.vocabulary.RDFS;

import com.sleepycat.je.DatabaseException;
import com.sleepycat.je.Environment;

import edu.unika.aifb.graphindex.algorithm.largercp.BlockCache;
import edu.unika.aifb.graphindex.data.Table;
import edu.unika.aifb.graphindex.searcher.keyword.analyzer.CapitalizationSplitterAnalyzer;
import edu.unika.aifb.graphindex.searcher.keyword.analyzer.DictionaryCompoundWordAnalyzer;
import edu.unika.aifb.graphindex.searcher.keyword.analyzer.HyphenationCompoundWordAnalyzer;
import edu.unika.aifb.graphindex.searcher.keyword.model.Constant;
import edu.unika.aifb.graphindex.storage.DataField;
import edu.unika.aifb.graphindex.storage.IndexDescription;
import edu.unika.aifb.graphindex.storage.NeighborhoodStorage;
import edu.unika.aifb.graphindex.storage.StorageException;
import edu.unika.aifb.graphindex.storage.keyword.BloomFilter;
import edu.unika.aifb.graphindex.storage.lucene.LuceneNeighborhoodStorage;
import edu.unika.aifb.graphindex.util.TypeUtil;
import edu.unika.aifb.graphindex.util.Util;

/**
 * Builds the Lucene-based keyword index, value index and entity-neighborhood
 * bloom filters for a graph index.
 *
 * <p>Three artifacts are produced:
 * <ul>
 *   <li>the <em>keyword</em> index: one document per schema element (concept,
 *       relation, attribute) and per entity, searchable by local name/label;</li>
 *   <li>the <em>value</em> index: one document per (entity, attribute, value)
 *       triple, used for matching literal values;</li>
 *   <li>a {@link NeighborhoodStorage} holding, per entity, a bloom filter of
 *       the entities reachable within {@code HOP} relation steps.</li>
 * </ul>
 *
 * <p>Building can be resumed: with {@code resume = true} existing indexes are
 * opened instead of recreated and already-indexed entities are skipped.
 */
public class KeywordIndexBuilder {

    // Document boosts per element type; concepts rank above instance data.
    private static final float CONCEPT_BOOST = 2.0f;
    private static final float RELATION_BOOST = 1.0f;
    private static final float ATTRIBUTE_BOOST = 1.0f;
    private static final float ENTITY_BOOST = 1.0f;
    // Discriminative attributes (rdfs:label, *name*) identify an entity and
    // are boosted above merely descriptive ones.
    private static final float ENTITY_DISCRIMINATIVE_BOOST = 5.0f;
    private static final float ENTITY_DESCRIPTIVE_BOOST = 1.0f;

    // Neighborhood size in hops for the reachable-entity bloom filters;
    // read from the index configuration in the constructor.
    private static int HOP;
    // Maximum number of terms Lucene indexes per field.
    private static final int MAXFIELDLENGTH = 100;

    // Target false-positive rate of the neighborhood bloom filters.
    private static final double FALSE_POSITIVE = 0.001;

    private IndexDirectory idxDirectory;
    private IndexConfiguration idxConfig;
    protected DataIndex dataIndex;
    private BlockCache blockSearcher;
    private NeighborhoodStorage ns;
    private Set<String> objectProperties;
    private Set<String> relations;
    private Set<String> attributes;
    private boolean resume = false;
    // Union of relations and attributes; everything else is ignored when
    // building entity descriptions.
    private Set<String> properties;
    private IndexWriter valueWriter;
    //   private Map<String,Double> propertyWeights;

    private static final Logger log = Logger.getLogger(KeywordIndexBuilder.class);

    /**
     * Creates a builder operating on the directories of the given index.
     *
     * @param idxReader provides the index directory, configuration and data index
     * @param resume    if true, existing keyword/value/neighborhood indexes are
     *                  opened for appending instead of being recreated
     * @throws IOException      if the neighborhood storage cannot be opened
     * @throws StorageException if the underlying index storage fails
     */
    public KeywordIndexBuilder(edu.unika.aifb.graphindex.index.IndexReader idxReader, boolean resume)
            throws IOException, StorageException {
        this.idxDirectory = idxReader.getIndexDirectory();
        this.idxConfig = idxReader.getIndexConfiguration();
        this.dataIndex = idxReader.getDataIndex();
        this.resume = resume;

        this.ns = new LuceneNeighborhoodStorage(
                idxDirectory.getDirectory(IndexDirectory.NEIGHBORHOOD_DIR, !resume));
        this.ns.initialize(!resume, false);

        //      this.propertyWeights = (Map<String,Double>)Yaml.load(idxDirectory.getFile(IndexDirectory.PROPERTY_FREQ_FILE));
        //      log.debug(propertyWeights);

        log.info("resume: " + resume);

        HOP = idxConfig.getInteger(IndexConfiguration.KW_NSIZE);

        // The block cache (structure index extensions) is only available when
        // the index was built with structural partitioning.
        if (idxConfig.getBoolean(IndexConfiguration.HAS_SP)) {
            try {
                this.blockSearcher = new BlockCache(idxDirectory);
            } catch (DatabaseException e) {
                log.error("could not open block cache", e);
            }
        }

    }

    /**
     * Builds the keyword and value indexes and the neighborhood bloom filters.
     *
     * <p>Schema elements (concepts, attributes, relations) are only indexed on
     * a fresh build; entity indexing is always run and, when resuming, skips
     * entities already present in the keyword index.
     *
     * @throws StorageException if the data index cannot be read
     * @throws IOException      if an index directory cannot be accessed
     */
    public void indexKeywords() throws StorageException, IOException {
        File indexDir = idxDirectory.getDirectory(IndexDirectory.KEYWORD_DIR, !resume);
        File valueDir = idxDirectory.getDirectory(IndexDirectory.VALUE_DIR, !resume);

        this.objectProperties = Util.readEdgeSet(idxDirectory.getFile(IndexDirectory.OBJECT_PROPERTIES_FILE));
        this.relations = Util.readEdgeSet(idxDirectory.getTempFile("relations", false));
        this.attributes = Util.readEdgeSet(idxDirectory.getTempFile("attributes", false));
        properties = new HashSet<String>();
        properties.addAll(relations);
        properties.addAll(attributes);

        log.debug("attributes: " + attributes.size() + ", relations: " + relations.size());

        try {
            //         HyphenationCompoundWordAnalyzer analyzer = new HyphenationCompoundWordAnalyzer("./res/en_hyph_US.xml", "./res/en_US.dic");
            //         DictionaryCompoundWordAnalyzer analyzer = new DictionaryCompoundWordAnalyzer("./res/en_US.dic");
            CapitalizationSplitterAnalyzer analyzer = new CapitalizationSplitterAnalyzer();
            StandardAnalyzer valueAnalyzer = new StandardAnalyzer();
            IndexWriter indexWriter = new IndexWriter(indexDir, analyzer, !resume,
                    new MaxFieldLength(MAXFIELDLENGTH));
            log.debug("max terms per field: " + indexWriter.getMaxFieldLength());

            valueWriter = new IndexWriter(valueDir, valueAnalyzer, !resume, new MaxFieldLength(MAXFIELDLENGTH));

            // When resuming, an IndexReader on the existing keyword index is
            // used to detect entities that were already indexed.
            org.apache.lucene.index.IndexReader reader = null;
            if (resume) {
                reader = org.apache.lucene.index.IndexReader.open(FSDirectory.getDirectory(indexDir), true);
                log.debug("docs: " + reader.numDocs());
            }

            if (!resume) {
                log.info("Indexing concepts");
                indexSchema(indexWriter, idxDirectory.getTempFile("concepts", false), TypeUtil.CONCEPT,
                        CONCEPT_BOOST);

                log.info("Indexing attributes");
                indexSchema(indexWriter, idxDirectory.getTempFile("attributes", false), TypeUtil.ATTRIBUTE,
                        ATTRIBUTE_BOOST);

                log.info("Indexing relations");
                indexSchema(indexWriter, idxDirectory.getTempFile("relations", false), TypeUtil.RELATION,
                        RELATION_BOOST);
            }

            log.info("Indexing entities");
            indexEntity(indexWriter, idxDirectory.getTempFile("entities", false), reader);

            indexWriter.commit();
            valueWriter.commit();

            log.debug("optimizing...");
            indexWriter.optimize();
            valueWriter.optimize();

            indexWriter.close();
            valueWriter.close();

            if (blockSearcher != null)
                blockSearcher.close();
            ns.optimize();
            ns.close();
        } catch (IOException e) {
            log.error("I/O error while building keyword index", e);
        } catch (DatabaseException e) {
            log.error("block cache error while building keyword index", e);
        } catch (Exception e) {
            log.error("error while building keyword index", e);
        }
    }

    /**
     * Indexes one schema element (concept, attribute or relation) per line of
     * {@code file}. Each document stores the element's type, URI, local name,
     * rdfs:label values and, for concepts with structural partitioning, the
     * extension (block) id.
     *
     * @param indexWriter writer for the keyword index
     * @param file        file with one element URI per line
     * @param type        element type constant from {@link TypeUtil}
     * @param boost       document boost applied to each element
     * @throws StorageException if label lookup in the data index fails
     */
    protected void indexSchema(IndexWriter indexWriter, File file, String type, float boost)
            throws StorageException {
        try {
            BufferedReader br = new BufferedReader(new FileReader(file));
            try {
                String line;
                while ((line = br.readLine()) != null) {
                    // Only plain one-token lines are element URIs; anything
                    // else is skipped.
                    String tokens[] = line.split("\t");
                    String uri, localName;
                    if (tokens.length == 1) {
                        uri = line.trim();
                        localName = TypeUtil.getLocalName(uri).trim();
                    } else {
                        continue;
                    }

                    /* Write Index */
                    Document doc = new Document();

                    // indexing type of the element
                    doc.add(new Field(Constant.TYPE_FIELD, type, Field.Store.YES,
                            Field.Index.NOT_ANALYZED_NO_NORMS));

                    // indexing local name
                    doc.add(new Field(Constant.SCHEMA_FIELD, localName, Field.Store.YES, Field.Index.ANALYZED,
                            Field.TermVector.NO));

                    // indexing label
                    Table<String> table = dataIndex.getTriples(uri, RDFS.LABEL.stringValue(), null);
                    for (String[] row : table) {
                        doc.add(new Field(Constant.SCHEMA_FIELD, row[2], Field.Store.YES, Field.Index.ANALYZED,
                                Field.TermVector.NO));
                    }

                    // indexing uri
                    doc.add(new Field(Constant.URI_FIELD, uri, Field.Store.YES,
                            Field.Index.NOT_ANALYZED_NO_NORMS));

                    // indexing extension id for concept
                    if (type.equals(TypeUtil.CONCEPT) && idxConfig.getBoolean(IndexConfiguration.HAS_SP)) {
                        String blockName = blockSearcher.getBlockName(uri);
                        doc.add(new Field(Constant.EXTENSION_FIELD, blockName, Field.Store.YES, Field.Index.NO));
                    }
                    doc.setBoost(boost);
                    indexWriter.addDocument(doc);
                }
            } finally {
                // Close the reader even if indexing a line fails.
                br.close();
            }
        } catch (IOException e) {
            log.error("I/O error while indexing schema elements from " + file, e);
        }
    }

    /**
     * Assembles the keyword-index fields for one entity: its type and URI plus
     * compressed lists of its incoming/outgoing object properties. As side
     * effects, the entity's attribute values are added to the value index and
     * a bloom filter of its {@code HOP}-neighborhood is stored.
     *
     * @param uri entity URI
     * @return fields to add to the entity's keyword-index document
     * @throws IOException      if the neighborhood storage fails
     * @throws StorageException if the data index cannot be read
     */
    protected List<Field> getFieldsForEntity(String uri) throws IOException, StorageException {
        List<Field> fields = new ArrayList<Field>();

        // indexing type of the element
        fields.add(new Field(Constant.TYPE_FIELD, TypeUtil.ENTITY, Field.Store.YES, Field.Index.NO));

        // indexing uri
        fields.add(new Field(Constant.URI_FIELD, uri, Field.Store.YES, Field.Index.NOT_ANALYZED_NO_NORMS));

        // indexing extension id 
        //      if (idxConfig.getBoolean(IndexConfiguration.HAS_SP)) {
        //         String blockName = blockSearcher.getBlockName(uri);
        ////         String blockName = structureIndex.getExtension(uri);
        //         if (blockName == null) {
        //            log.warn("no ext for " + uri);
        //            return null;
        //         }
        //         fields.add(new Field(Constant.EXTENSION_FIELD, blockName, Field.Store.YES, Field.Index.NO));
        //      }

        // indexing concept of the entity element
        //      Set<String> concepts = computeConcepts(uri);
        //      if(concepts != null && concepts.size() != 0) {
        //         for(String concept : concepts) {
        //            Field field = new Field(Constant.CONCEPT_FIELD, concept, Field.Store.YES, Field.Index.NO);
        //            fields.add(field);
        //         }
        //      }

        // indexing entity descriptions in value index
        computeEntityDescriptions(uri);

        List<Field> propertyFields = computeProperties(uri);
        fields.addAll(propertyFields);

        // indexing reachable entities
        Set<String> reachableEntities = computeReachableEntities(uri);
        //      BloomFilter bf = new BloomFilter(reachableEntities.size(), NUMBER_HASHFUNCTION);
        BloomFilter bf = new BloomFilter(reachableEntities.size(), FALSE_POSITIVE);
        for (String entity : reachableEntities) {
            // Strip common URI prefixes to shorten the strings hashed into
            // the bloom filter; lookups must apply the same normalization.
            if (entity.startsWith("http://www."))
                entity = entity.substring(11);
            else if (entity.startsWith("http://"))
                entity = entity.substring(7);
            bf.add(entity);
        }

        ns.addNeighborhoodBloomFilter(uri, bf);

        return fields;
    }

    /**
     * Indexes one entity per line of {@code file}. When {@code reader} is
     * non-null (resume mode), entities whose URI is already present in the
     * keyword index are skipped. Commits all writers every 100,000 entities.
     *
     * @param indexWriter writer for the keyword index
     * @param file        file with one entity URI per line
     * @param reader      reader on the existing keyword index, or null
     * @throws IOException      if the neighborhood storage fails
     * @throws StorageException if the data index cannot be read
     */
    private void indexEntity(IndexWriter indexWriter, File file, IndexReader reader)
            throws IOException, StorageException {
        try {
            BufferedReader br = new BufferedReader(new FileReader(file));
            try {
                String line;
                int entities = 0;
                long time = System.currentTimeMillis();
                while ((line = br.readLine()) != null) {
                    String uri = line.trim();

                    if (reader != null) {
                        // Skip entities already indexed in a previous run.
                        TermDocs td = reader.termDocs(new Term(Constant.URI_FIELD, uri));
                        boolean alreadyIndexed;
                        try {
                            alreadyIndexed = td.next();
                        } finally {
                            td.close();
                        }
                        if (alreadyIndexed)
                            continue;
                    }

                    Document doc = new Document();

                    List<Field> fields = getFieldsForEntity(uri);

                    if (fields == null)
                        continue;

                    for (Field f : fields)
                        doc.add(f);

                    indexWriter.addDocument(doc);

                    entities++;
                    // Periodic commit so a crash or resume loses at most one batch.
                    if (entities % 100000 == 0) {
                        indexWriter.commit();
                        valueWriter.commit();
                        ns.commit();
                        log.debug("entities indexed: " + entities + " avg: "
                                + ((System.currentTimeMillis() - time) / 100000.0));
                        time = System.currentTimeMillis();
                    }
                }

                log.debug(entities + " entities indexed");
            } finally {
                // Close the reader even if indexing an entity fails.
                br.close();
            }
        } catch (IOException e) {
            log.error("I/O error while indexing entities from " + file, e);
        }
    }

    /**
     * Returns the rdf:type objects of the given entity.
     *
     * @param entityUri entity URI
     * @return set of concept URIs the entity is typed with
     */
    private Set<String> computeConcepts(String entityUri) throws IOException, StorageException {
        IndexDescription idx = dataIndex.getSuitableIndex(DataField.PROPERTY, DataField.SUBJECT);
        return dataIndex.getIndexStorage(idx).getDataSet(idx, DataField.OBJECT,
                idx.createValueArray(DataField.PROPERTY, RDF.TYPE.stringValue(), DataField.SUBJECT, entityUri));
    }

    /**
     * Returns the rdfs:label objects of the given entity.
     *
     * @param entityUri entity URI
     * @return set of label values
     */
    private Set<String> computeLabels(String entityUri) throws IOException, StorageException {
        IndexDescription idx = dataIndex.getSuitableIndex(DataField.PROPERTY, DataField.SUBJECT);
        return dataIndex.getIndexStorage(idx).getDataSet(idx, DataField.OBJECT,
                idx.createValueArray(DataField.PROPERTY, RDFS.LABEL.stringValue(), DataField.SUBJECT, entityUri));
    }

    /**
     * Adds one value-index document per attribute triple of the entity, each
     * holding the entity URI, the attribute, the extension id and the
     * analyzed literal value. Label/name attributes get a higher boost.
     *
     * <p>NOTE(review): this uses {@code blockSearcher} unconditionally,
     * although it is only created when HAS_SP is set — confirm that entity
     * indexing is only run on indexes with structural partitioning.
     *
     * @param entityUri entity URI
     */
    private void computeEntityDescriptions(String entityUri) throws IOException, StorageException {
        String ext = blockSearcher.getBlockName(entityUri);

        //      Document doc = new Document();
        //      doc.add(new Field(Constant.URI_FIELD, entityUri, Field.Store.YES, Field.Index.NO));
        //      doc.add(new Field(Constant.EXTENSION_FIELD, ext, Field.Store.YES, Field.Index.NO));
        //      doc.add(new Field(Constant.ATTRIBUTE_FIELD, Constant.ATTRIBUTE_LOCALNAME, Field.Store.YES, Field.Index.NOT_ANALYZED_NO_NORMS));
        //      Field f = new Field(Constant.CONTENT_FIELD, TypeUtil.getLocalName(entityUri).trim(), Field.Store.NO, Field.Index.ANALYZED);
        //      doc.add(f);
        //      doc.setBoost(ENTITY_DISCRIMINATIVE_BOOST);
        //      valueWriter.addDocument(doc);

        Table<String> table = dataIndex.getTriples(entityUri, null, null);
        if (table.rowCount() == 0)
            return;
        for (String[] row : table) {
            String predicate = row[1];
            String object = row[2];

            // Skip predicates that are neither attributes nor relations, and
            // skip relations themselves. TODO maybe index relations
            if (!properties.contains(predicate) || relations.contains(predicate))
                continue;

            Document doc = new Document();
            doc.add(new Field(Constant.URI_FIELD, entityUri, Field.Store.YES, Field.Index.NO));
            doc.add(new Field(Constant.ATTRIBUTE_FIELD, predicate, Field.Store.YES,
                    Field.Index.NOT_ANALYZED_NO_NORMS));
            doc.add(new Field(Constant.EXTENSION_FIELD, ext, Field.Store.YES, Field.Index.NO));

            float boost = ENTITY_DESCRIPTIVE_BOOST;
            if (predicate.equals(RDFS.LABEL.toString()))
                boost = ENTITY_DISCRIMINATIVE_BOOST;
            if (TypeUtil.getLocalName(predicate).contains("name"))
                boost = ENTITY_DISCRIMINATIVE_BOOST;

            //         boost *= propertyWeights.get(predicate);
            doc.setBoost(boost);

            // After the guard above the predicate is always an attribute
            // (properties = attributes + relations, and relations were
            // skipped); the former relation branch here was unreachable and
            // has been removed.
            if (attributes.contains(predicate)) {
                Field f = new Field(Constant.CONTENT_FIELD, object, Field.Store.NO, Field.Index.ANALYZED);
                //            f.setBoost(boost);
                doc.add(f);
            }

            valueWriter.addDocument(doc);
        }

    }

    /**
     * Collects the object properties the entity occurs with, split into
     * incoming (entity as object) and outgoing (entity as subject) sets, and
     * returns them as two compressed, newline-joined stored fields.
     *
     * @param uri entity URI
     * @return fields for {@link Constant#IN_PROPERTIES_FIELD} and
     *         {@link Constant#OUT_PROPERTIES_FIELD}
     */
    private List<Field> computeProperties(String uri) throws StorageException {
        Set<String> inProperties = new HashSet<String>(), outProperties = new HashSet<String>();

        Table<String> table = dataIndex.getTriples(uri, null, null);
        if (table != null && table.rowCount() > 0) {
            for (String[] row : table) {
                if (objectProperties.contains(row[1]))
                    outProperties.add(row[1]);
            }
        }

        table = dataIndex.getTriples(null, null, uri);
        if (table != null && table.rowCount() > 0) {
            for (String[] row : table) {
                if (objectProperties.contains(row[1]))
                    inProperties.add(row[1]);
            }
        }

        // most entities have type anyway
        outProperties.remove(RDF.TYPE.toString());

        List<Field> fields = new ArrayList<Field>();

        StringBuilder sb = new StringBuilder();
        for (String property : inProperties)
            sb.append(property).append('\n');
        fields.add(new Field(Constant.IN_PROPERTIES_FIELD, sb.toString(), Field.Store.COMPRESS, Field.Index.NO));

        sb = new StringBuilder();
        for (String property : outProperties)
            sb.append(property).append('\n');
        fields.add(new Field(Constant.OUT_PROPERTIES_FIELD, sb.toString(), Field.Store.COMPRESS, Field.Index.NO));

        return fields;
    }

    /**
     * Returns the entities directly connected to the given entity via a
     * relation, in either direction.
     *
     * @param entityUri entity URI
     * @return neighbor entity URIs
     */
    private Set<String> computeNeighbors(String entityUri) throws IOException, StorageException {
        HashSet<String> set = new HashSet<String>(500);

        // Outgoing relation edges: entity as subject, objects are neighbors.
        Table<String> table = dataIndex.getTriples(entityUri, null, null);

        for (String[] row : table) {
            String predicate = row[1];
            String object = row[2];
            if (relations.contains(predicate)) {
                set.add(object);
            }
        }

        // Incoming relation edges: entity as object, subjects are neighbors.
        table = dataIndex.getTriples(null, null, entityUri);
        for (String[] row : table) {
            String predicate = row[1];
            String subject = row[0];
            if (relations.contains(predicate))
                set.add(subject);
        }

        return set;
    }

    /**
     * Breadth-first search of the entities reachable within {@code HOP}
     * relation steps of the given entity. The result includes the entity
     * itself.
     *
     * @param entityUri start entity URI
     * @return reachable entity URIs (including {@code entityUri})
     */
    private Set<String> computeReachableEntities(String entityUri) throws IOException, StorageException {
        HashSet<String> reachableEntities = new HashSet<String>();
        HashSet<String> currentLayer = new HashSet<String>();
        HashSet<String> nextLayer = new HashSet<String>();
        currentLayer.add(entityUri);

        int i = 0;
        while (i < HOP && !currentLayer.isEmpty()) {
            for (String entity : currentLayer) {
                Set<String> neighbors = computeNeighbors(entity);
                if (neighbors != null && neighbors.size() != 0) {
                    for (String neighbor : neighbors) {
                        // Set.add returns false for already-seen entities,
                        // so each entity is expanded at most once.
                        if (!neighbor.equals(entityUri) && reachableEntities.add(neighbor))
                            nextLayer.add(neighbor);
                    }
                }
            }
            currentLayer = nextLayer;
            nextLayer = new HashSet<String>();
            i++;
        }
        reachableEntities.add(entityUri);
        return reachableEntities;
    }

}