edu.unika.aifb.graphindex.searcher.entity.EntitySearcher.java Source code

Java tutorial

Introduction

Here is the source code for edu.unika.aifb.graphindex.searcher.entity.EntitySearcher.java

Source

package edu.unika.aifb.graphindex.searcher.entity;

/**
 * Copyright (C) 2009 Lei Zhang (beyondlei at gmail.com)
 * 
 * This file is part of the graphindex project.
 *
 * graphindex is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License, version 2
 * as published by the Free Software Foundation.
 * 
 * graphindex is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 * 
 * You should have received a copy of the GNU General Public License
 * along with graphindex.  If not, see <http://www.gnu.org/licenses/>.
 */

import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Scanner;
import java.util.Set;

import org.apache.log4j.Logger;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.SetBasedFieldSelector;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.queryParser.ParseException;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.FuzzyQuery;
import org.apache.lucene.search.HitCollector;
import org.apache.lucene.search.Hits;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.PrefixQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopDocCollector;
import org.apache.lucene.search.BooleanClause.Occur;
import org.openrdf.model.vocabulary.RDFS;

import edu.unika.aifb.graphindex.index.IndexDirectory;
import edu.unika.aifb.graphindex.model.IEntity;
import edu.unika.aifb.graphindex.model.impl.Entity;
import edu.unika.aifb.graphindex.searcher.keyword.model.Constant;
import edu.unika.aifb.graphindex.searcher.keyword.model.KeywordElement;
import edu.unika.aifb.graphindex.searcher.keyword.model.TransformedGraph;
import edu.unika.aifb.graphindex.searcher.keyword.model.TransformedGraphNode;
import edu.unika.aifb.graphindex.storage.NeighborhoodStorage;
import edu.unika.aifb.graphindex.storage.StorageException;
import edu.unika.aifb.graphindex.storage.lucene.LuceneNeighborhoodStorage;
import edu.unika.aifb.graphindex.util.TypeUtil;

public class EntitySearcher {

    // Neighborhood storage; passed to every KeywordElement created by the entity searches.
    private NeighborhoodStorage ns;
    // Lucene searcher over the keyword index, built on top of 'reader'.
    private IndexSearcher searcher;
    // Lucene reader for the keyword index directory; used to load stored documents by id.
    private IndexReader reader;

    // NOTE(review): not referenced in the visible code - presumably a minimum score for entity hits; confirm or remove.
    private static final double ENTITY_THRESHOLD = 0.5;
    // Minimum Lucene score a schema hit (concept or attribute) must reach to be accepted.
    private static final double SCHEMA_THRESHOLD = 0.8;
    // NOTE(review): not referenced in the visible code - presumably a cap on keyword results; confirm or remove.
    private static final int MAX_KEYWORDRESULT_SIZE = 10;

    // Delimiter between key and value of a keyword compound, as split by parseQueries.
    private static final String SEPARATOR = ":";

    private static final Logger log = Logger.getLogger(EntitySearcher.class);

    /**
     * Opens the keyword index and the neighborhood storage that belong to the given index.
     *
     * @param idxReader graphindex reader whose index directory supplies the keyword and
     *                  neighborhood directories
     * @throws StorageException if an I/O error occurs while opening the index (the
     *                          IOException is wrapped), or if one of the called storage
     *                          operations throws it
     */
    public EntitySearcher(edu.unika.aifb.graphindex.index.IndexReader idxReader) throws StorageException {
        try {
            reader = IndexReader.open(idxReader.getIndexDirectory().getDirectory(IndexDirectory.KEYWORD_DIR));
            searcher = new IndexSearcher(reader);
            ns = new LuceneNeighborhoodStorage(
                    idxReader.getIndexDirectory().getDirectory(IndexDirectory.NEIGHBORHOOD_DIR));
            ns.initialize(false, true);
        } catch (IOException e) {
            // Do not swallow the failure: a searcher without reader/searcher/storage would only
            // fail later with a NullPointerException far away from the real cause.
            throw new StorageException(e);
        }
    }

    /**
     * Runs a prefix query for every given warm-up entry and discards the hits.
     * <p>
     * Each entry must have the form {@code "<field> <prefix>"}; it is split at the first
     * space into the field name and the prefix text. Entries without a space are skipped
     * with a warning instead of failing with an ArrayIndexOutOfBoundsException.
     *
     * @param queries warm-up entries of the form {@code "<field> <prefix>"}
     * @throws StorageException if executing one of the queries fails
     */
    public void warmUp(Set<String> queries) throws StorageException {
        for (String query : queries) {
            String[] t = query.split(" ", 2);
            if (t.length < 2) {
                log.warn("skipping malformed warm-up query (expected '<field> <prefix>'): " + query);
                continue;
            }
            // Only the act of searching matters here; the result is intentionally ignored.
            getDocuments(new PrefixQuery(new Term(t[0], t[1])));
        }
    }

    /**
     * Looks up the matching entities for every node of the graph and stores them on the node.
     * <p>
     * Entity query nodes are resolved through their attribute and type queries (only when at
     * least one attribute query is present); entity nodes are resolved through their URI
     * query. Nodes of any other type are left untouched. The outcome per node is logged at
     * debug level.
     *
     * @param graph  graph whose nodes are filled with entities
     * @param cutOff maximum number of hits for attribute based lookups; values &lt;= 0 mean no limit
     * @return the same graph instance that was passed in
     */
    public TransformedGraph searchEntities(TransformedGraph graph, int cutOff) {
        for (TransformedGraphNode node : graph.getNodes()) {
            if (node.getType() == TransformedGraphNode.ENTITY_QUERY_NODE) {
                Map<String, Collection<String>> attributeQueries = node.getAttributeQueries();
                boolean hasAttributeQueries = attributeQueries != null && !attributeQueries.keySet().isEmpty();
                if (hasAttributeQueries) {
                    node.setEntities(searchEntities(attributeQueries, node.getTypeQueries(), cutOff));
                }
            } else if (node.getType() == TransformedGraphNode.ENTITY_NODE) {
                node.setEntities(searchEntities(node.getUriQuery()));
            }

            if (node.getEntities() == null) {
                log.debug("variable: " + node.getNodeName() + ", no entities");
            } else {
                log.debug("variable: " + node.getNodeName() + ", entities: " + node.getEntities().size());
            }
        }

        return graph;
    }

    /**
     * Looks up the matching entities for every node of the graph without limiting the number
     * of hits.
     * <p>
     * Equivalent to {@code searchEntities(graph, 0)}: a cut-off of 0 makes the attribute based
     * lookup collect all matching documents instead of only the top ones. Delegating keeps the
     * two overloads from drifting apart.
     *
     * @param graph graph whose nodes are filled with entities
     * @return the same graph instance that was passed in
     */
    public TransformedGraph searchEntities(TransformedGraph graph) {
        return searchEntities(graph, 0);
    }

    /**
     * Checks whether the keyword index lists the given concept for the given entity.
     * <p>
     * All documents whose URI field equals {@code entity} are inspected; the method returns
     * true as soon as one of them is of type entity and the first value of its concept field
     * equals {@code concept}. Documents lacking a type or concept field are skipped.
     * NOTE(review): only the first concept value of a document is compared - confirm that
     * entity documents never carry several concept values.
     *
     * @param entity  URI of the entity, matched exactly against the URI field
     * @param concept concept URI to test for
     * @return true if a matching entity document has the concept; false otherwise, including
     *         when the lookup fails (the failure is logged)
     */
    public boolean isType(String entity, String concept) {
        TermQuery tq = new TermQuery(new Term(Constant.URI_FIELD, entity));
        try {
            ScoreDoc[] docHits = getDocuments(tq);
            Set<String> loadFieldNames = new HashSet<String>();
            loadFieldNames.add(Constant.URI_FIELD);
            loadFieldNames.add(Constant.TYPE_FIELD);
            loadFieldNames.add(Constant.EXTENSION_FIELD);
            loadFieldNames.add(Constant.CONCEPT_FIELD);
            Set<String> lazyFieldNames = new HashSet<String>();
            lazyFieldNames.add(Constant.NEIGHBORHOOD_FIELD);
            SetBasedFieldSelector fieldSelector = new SetBasedFieldSelector(loadFieldNames, lazyFieldNames);

            for (ScoreDoc docHit : docHits) {
                Document doc = reader.document(docHit.doc, fieldSelector);
                // Document.get returns null for an absent field, whereas
                // getFieldable(..).stringValue() would throw and abort the whole scan.
                String type = doc.get(Constant.TYPE_FIELD);
                if (type == null) {
                    log.warn("type is null! (doc " + docHit.doc + ")");
                    continue;
                }

                if (type.equals(TypeUtil.ENTITY)) {
                    String docConcept = doc.get(Constant.CONCEPT_FIELD);
                    if (docConcept != null && docConcept.equals(concept)) {
                        return true;
                    }
                }
            }
        } catch (Exception e) {
            // Best effort by contract (no checked exception is declared): report and answer "no".
            log.error("type check failed for entity " + entity + " and concept " + concept, e);
        }
        return false;
    }

    /**
     * Finds entities that match the given attribute/value queries and, optionally, type queries.
     * <p>
     * The type queries are resolved to concepts and the attribute keywords to attribute
     * fields first; the entity search only runs when at least one attribute keyword could be
     * resolved.
     *
     * @param attributeQueries attribute keyword mapped to the values searched for; must not be null
     * @param typeQueries      keywords or URIs describing the wanted concepts; may be null or empty
     * @param cutOff           maximum number of hits; values &lt;= 0 mean no limit
     * @return the matching entities, empty if nothing matched
     */
    public Collection<KeywordElement> searchEntities(Map<String, Collection<String>> attributeQueries,
            Collection<String> typeQueries, int cutOff) {
        Collection<String> matchedConcepts = new ArrayList<String>();
        searchConcepts(searcher, typeQueries, matchedConcepts);

        Map<String, Collection<String>> resolvedAttributes = new HashMap<String, Collection<String>>();
        searchAttributes(searcher, attributeQueries.keySet(), resolvedAttributes);

        Collection<KeywordElement> found = new ArrayList<KeywordElement>();
        if (resolvedAttributes.isEmpty()) {
            return found;
        }

        searchEntitiesByAttributeVauleCompounds(searcher, attributeQueries, resolvedAttributes, matchedConcepts,
                found, cutOff);
        return found;
    }

    /**
     * Finds the entities whose URI field equals the given URI.
     *
     * @param uriQuery entity URI to look up
     * @return the matching entities, empty if none were found
     */
    public Collection<KeywordElement> searchEntities(String uriQuery) {
        Collection<KeywordElement> matches = new ArrayList<KeywordElement>();
        searchEntitiesByUri(searcher, uriQuery, matches);
        return matches;
    }

    /**
     * Resolves type keywords to concept URIs and appends them to {@code concepts}.
     * <p>
     * A keyword starting with {@code Constant.URI_PREFIX} is taken verbatim as a concept URI.
     * Any other keyword is parsed as a Lucene query against the schema field and resolved
     * through {@link #searchConceptWithClause(IndexSearcher, Query)}. A keyword that cannot
     * be parsed is logged and skipped, so it no longer prevents the remaining keywords from
     * being resolved.
     *
     * @param searcher searcher over the keyword index
     * @param queries  type keywords or URIs; null or empty means nothing to do
     * @param concepts output collection receiving the resolved concept URIs
     */
    public void searchConcepts(IndexSearcher searcher, Collection<String> queries, Collection<String> concepts) {
        if (queries == null || queries.isEmpty()) {
            return;
        }

        // search schema elements
        StandardAnalyzer analyzer = new StandardAnalyzer();
        for (String keyword : queries) {
            if (keyword.startsWith(Constant.URI_PREFIX)) {
                concepts.add(keyword);
                continue;
            }
            try {
                QueryParser parser = new QueryParser(Constant.SCHEMA_FIELD, analyzer);
                Query q = parser.parse(keyword);
                Collection<String> tmp = searchConceptWithClause(searcher, q);
                if (tmp != null && !tmp.isEmpty()) {
                    concepts.addAll(tmp);
                }
            } catch (ParseException e) {
                log.warn("cannot parse concept keyword '" + keyword + "', skipping it", e);
            }
        }
    }

    /**
     * Runs the given schema query and returns the URIs of all concept documents among the hits.
     * <p>
     * If the query has no hits, its terms are retried as a conjunction of fuzzy queries
     * (minimum similarity 0.8, prefix length 1). Only hits scoring at least
     * {@code SCHEMA_THRESHOLD} whose type field marks them as concept are returned. Any
     * exception is printed and whatever was collected up to that point is returned.
     *
     * @param searcher searcher over the keyword index
     * @param clause   query to execute
     * @return set of matching concept URIs, possibly empty
     */
    public Collection<String> searchConceptWithClause(IndexSearcher searcher, Query clause) {
        Collection<String> conceptUris = new HashSet<String>();
        try {
            Hits hits = searcher.search(clause);

            // Fuzzy fallback: only attempted when the exact query found nothing.
            boolean noExactHits = hits == null || hits.length() == 0;
            if (noExactHits) {
                Set<Term> clauseTerms = new HashSet<Term>();
                clause.extractTerms(clauseTerms);
                // Queries without extractable terms have nothing to fuzzify.
                if (!clauseTerms.isEmpty()) {
                    BooleanQuery fuzzy = new BooleanQuery();
                    for (Term clauseTerm : clauseTerms) {
                        fuzzy.add(new FuzzyQuery(clauseTerm, 0.8f, 1), Occur.MUST);
                    }
                    hits = searcher.search(fuzzy);
                    log.debug(fuzzy + " " + hits);
                }
            }

            for (int idx = 0; idx < hits.length(); idx++) {
                Document hitDoc = hits.doc(idx);
                float hitScore = hits.score(idx);
                if (!(hitScore >= SCHEMA_THRESHOLD)) {
                    continue;
                }

                String elementType = hitDoc.get(Constant.TYPE_FIELD);
                if (elementType == null) {
                    System.err.println("type is null!");
                    continue;
                }

                if (elementType.equals(TypeUtil.CONCEPT)) {
                    conceptUris.add(hitDoc.get(Constant.URI_FIELD));
                }
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
        return conceptUris;
    }

    /**
     * Resolves attribute keywords to index field names and records them per keyword in
     * {@code attributes}.
     * <p>
     * Resolution rules, in order:
     * <ul>
     * <li>a keyword starting with {@code Constant.URI_PREFIX} maps to itself, except that the
     * rdfs:label URI maps to the label field;</li>
     * <li>the label, local name and concept field names map to themselves;</li>
     * <li>anything else is parsed as a Lucene query against the schema field and resolved
     * through {@link #searchAttributesWithClause(IndexSearcher, Query)}.</li>
     * </ul>
     * Keywords that resolve to nothing get no entry. A keyword that cannot be parsed is
     * logged and skipped, so it no longer prevents the remaining keywords from being resolved.
     *
     * @param searcher   searcher over the keyword index
     * @param queries    attribute keywords or URIs; null means nothing to do
     * @param attributes output map from keyword to the attribute fields it resolved to
     */
    public void searchAttributes(IndexSearcher searcher, Collection<String> queries,
            Map<String, Collection<String>> attributes) {
        if (queries == null) {
            return;
        }

        // search schema elements
        StandardAnalyzer analyzer = new StandardAnalyzer();
        for (String keyword : queries) {
            Collection<String> resolved;
            if (keyword.startsWith(Constant.URI_PREFIX)) {
                resolved = new ArrayList<String>();
                if (keyword.equals(RDFS.LABEL.stringValue())) {
                    resolved.add(Constant.LABEL_FIELD);
                } else {
                    resolved.add(keyword);
                }
            } else if (keyword.equals(Constant.LABEL_FIELD) || keyword.equals(Constant.LOCALNAME_FIELD)
                    || keyword.equals(Constant.CONCEPT_FIELD)) {
                resolved = new ArrayList<String>();
                resolved.add(keyword);
            } else {
                try {
                    QueryParser parser = new QueryParser(Constant.SCHEMA_FIELD, analyzer);
                    Query q = parser.parse(keyword);
                    resolved = searchAttributesWithClause(searcher, q);
                } catch (ParseException e) {
                    log.warn("cannot parse attribute keyword '" + keyword + "', skipping it", e);
                    continue;
                }
            }

            if (resolved == null || resolved.isEmpty()) {
                continue;
            }

            Collection<String> coll = attributes.get(keyword);
            if (coll == null) {
                coll = new ArrayList<String>();
                attributes.put(keyword, coll);
            }
            coll.addAll(resolved);
        }
    }

    /**
     * Runs the given schema query and returns the URIs of all attribute documents among the hits.
     * <p>
     * When the query yields no hits, its terms are combined into a conjunction of fuzzy
     * queries (minimum similarity 0.8, prefix length 1) and that query is run instead. A hit
     * is accepted only if its score reaches {@code SCHEMA_THRESHOLD} and its type field marks
     * it as attribute. Exceptions are printed; results gathered before the failure are kept.
     *
     * @param searcher searcher over the keyword index
     * @param clause   query to execute
     * @return set of matching attribute URIs, possibly empty
     */
    public Collection<String> searchAttributesWithClause(IndexSearcher searcher, Query clause) {
        Collection<String> attributeUris = new HashSet<String>();
        try {
            Hits found = searcher.search(clause);

            // Retry fuzzily, but only if the exact query came back empty.
            if (found == null || found.length() == 0) {
                Set<Term> extracted = new HashSet<Term>();
                clause.extractTerms(extracted);
                // Nothing to retry with if the query exposes no terms.
                if (!extracted.isEmpty()) {
                    BooleanQuery fuzzyQuery = new BooleanQuery();
                    for (Term each : extracted) {
                        fuzzyQuery.add(new FuzzyQuery(each, 0.8f, 1), Occur.MUST);
                    }
                    found = searcher.search(fuzzyQuery);
                    log.debug(fuzzyQuery + " " + found);
                }
            }

            for (int pos = 0; pos < found.length(); pos++) {
                Document current = found.doc(pos);
                float currentScore = found.score(pos);
                if (!(currentScore >= SCHEMA_THRESHOLD)) {
                    continue;
                }

                String kind = current.get(Constant.TYPE_FIELD);
                if (kind == null) {
                    System.err.println("type is null!");
                    continue;
                }

                if (kind.equals(TypeUtil.ATTRIBUTE)) {
                    attributeUris.add(current.get(Constant.URI_FIELD));
                }
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
        return attributeUris;
    }

    /**
     * Looks up the entity documents whose URI field equals {@code query} and appends a
     * KeywordElement for each of them to {@code entities}.
     * <p>
     * Documents that are not of type entity, or that lack the type, URI or extension field,
     * are skipped. Failures of the lookup are logged; elements added before the failure stay
     * in {@code entities}.
     *
     * @param searcher unused; the search runs on this instance's own searcher via getDocuments
     * @param query    entity URI, matched exactly
     * @param entities output collection receiving the matching entities
     */
    private void searchEntitiesByUri(IndexSearcher searcher, String query, Collection<KeywordElement> entities) {
        TermQuery tq = new TermQuery(new Term(Constant.URI_FIELD, query));
        try {
            ScoreDoc[] docHits = getDocuments(tq);
            Set<String> loadFieldNames = new HashSet<String>();
            loadFieldNames.add(Constant.URI_FIELD);
            loadFieldNames.add(Constant.TYPE_FIELD);
            loadFieldNames.add(Constant.EXTENSION_FIELD);
            loadFieldNames.add(Constant.CONCEPT_FIELD);
            Set<String> lazyFieldNames = new HashSet<String>();
            lazyFieldNames.add(Constant.NEIGHBORHOOD_FIELD);
            SetBasedFieldSelector fieldSelector = new SetBasedFieldSelector(loadFieldNames, lazyFieldNames);

            for (ScoreDoc docHit : docHits) {
                Document doc = reader.document(docHit.doc, fieldSelector);
                // Document.get returns null for an absent field, whereas
                // getFieldable(..).stringValue() would throw and abort the whole loop.
                String type = doc.get(Constant.TYPE_FIELD);
                if (type == null) {
                    log.warn("type is null! (doc " + docHit.doc + ")");
                    continue;
                }
                if (!type.equals(TypeUtil.ENTITY)) {
                    continue;
                }

                String uri = doc.get(Constant.URI_FIELD);
                String extension = doc.get(Constant.EXTENSION_FIELD);
                if (uri == null || extension == null) {
                    log.warn("entity document " + docHit.doc + " lacks uri or extension, skipping it");
                    continue;
                }

                IEntity ent = new Entity(pruneString(uri), extension);
                entities.add(new KeywordElement(ent, KeywordElement.ENTITY, doc, docHit.score, ns));
            }
        } catch (Exception e) {
            // Best effort by contract (no checked exception is declared): report and keep partial results.
            log.error("entity lookup by uri failed for " + query, e);
        }
    }

    /**
     * Builds one boolean query out of all attribute/value compounds and the concepts, runs
     * it and appends the matching entities to {@code entities}.
     * <p>
     * Query shape: every keyword contributes one required sub-query. That sub-query is a
     * disjunction over the attribute fields the keyword resolved to; inside each field all
     * values are required, and if a parsed value is itself a boolean query all of its clauses
     * are made required as well. A keyword without resolved fields contributes an empty
     * required sub-query. If concepts are given, every one of them is additionally required
     * on the concept field. If a value cannot be parsed, the exception is printed and no
     * search is executed.
     *
     * @param searcher   searcher handed on to searchEntitiesWithClause
     * @param queries    attribute keyword mapped to the values searched for
     * @param attributes attribute keyword mapped to the index fields it resolved to
     * @param concepts   concept URIs the entities must have; may be null or empty
     * @param entities   output collection receiving the matching entities
     * @param cutOff     maximum number of hits; values &lt;= 0 mean no limit
     */
    private void searchEntitiesByAttributeVauleCompounds(IndexSearcher searcher,
            Map<String, Collection<String>> queries, Map<String, Collection<String>> attributes,
            Collection<String> concepts, Collection<KeywordElement> entities, int cutOff) {
        BooleanQuery entityQuery = new BooleanQuery();
        try {
            StandardAnalyzer analyzer = new StandardAnalyzer();
            for (Map.Entry<String, Collection<String>> compound : queries.entrySet()) {
                BooleanQuery perKeyword = new BooleanQuery();
                Collection<String> fields = attributes.get(compound.getKey());
                if (fields != null) {
                    for (String field : fields) {
                        QueryParser valueParser = new QueryParser(field, analyzer);
                        BooleanQuery allValues = new BooleanQuery();
                        for (String value : compound.getValue()) {
                            Query parsed = valueParser.parse(value);
                            if (parsed instanceof BooleanQuery) {
                                // Multi-term values: every term has to match, not just any.
                                for (BooleanClause part : ((BooleanQuery) parsed).getClauses()) {
                                    part.setOccur(Occur.MUST);
                                }
                            }
                            allValues.add(parsed, Occur.MUST);
                        }
                        perKeyword.add(allValues, Occur.SHOULD);
                    }
                }
                entityQuery.add(perKeyword, Occur.MUST);
            }

            boolean hasConcepts = concepts != null && !concepts.isEmpty();
            if (hasConcepts) {
                BooleanQuery typeQuery = new BooleanQuery();
                for (String concept : concepts) {
                    typeQuery.add(new TermQuery(new Term(Constant.CONCEPT_FIELD, concept)), Occur.MUST);
                }
                entityQuery.add(typeQuery, Occur.MUST);
            }

            searchEntitiesWithClause(searcher, entityQuery, entities, cutOff);
        } catch (ParseException e) {
            // TODO Auto-generated catch block
            e.printStackTrace();
        }
    }

    /**
     * Runs the given query and appends a KeywordElement for every entity document among the
     * hits to {@code result}.
     * <p>
     * With a positive {@code cutOff} only the top {@code cutOff} documents are considered,
     * otherwise all matching documents. Documents that are not of type entity, or that lack
     * the type, URI or extension field, are skipped. Failures are logged; elements added
     * before the failure stay in {@code result}.
     *
     * @param searcher unused; the search runs on this instance's own searcher
     * @param query    query to execute
     * @param result   output collection receiving the matching entities
     * @param cutOff   maximum number of hits; values &lt;= 0 mean no limit
     * @return the {@code result} collection that was passed in
     */
    public Collection<KeywordElement> searchEntitiesWithClause(IndexSearcher searcher, Query query,
            Collection<KeywordElement> result, int cutOff) {
        try {
            Set<String> loadFieldNames = new HashSet<String>();
            loadFieldNames.add(Constant.URI_FIELD);
            loadFieldNames.add(Constant.TYPE_FIELD);
            loadFieldNames.add(Constant.EXTENSION_FIELD);
            Set<String> lazyFieldNames = new HashSet<String>();
            lazyFieldNames.add(Constant.NEIGHBORHOOD_FIELD);
            SetBasedFieldSelector fieldSelector = new SetBasedFieldSelector(loadFieldNames, lazyFieldNames);

            ScoreDoc[] docHits;
            if (cutOff > 0) {
                docHits = getTopDocuments(query, cutOff);
            } else {
                docHits = getDocuments(query);
            }

            for (ScoreDoc docHit : docHits) {
                Document doc = reader.document(docHit.doc, fieldSelector);
                // Document.get returns null for an absent field, whereas
                // getFieldable(..).stringValue() would throw and abort the whole loop.
                String type = doc.get(Constant.TYPE_FIELD);
                if (type == null) {
                    log.warn("type is null! (doc " + docHit.doc + ")");
                    continue;
                }
                if (!type.equals(TypeUtil.ENTITY)) {
                    continue;
                }

                String uri = doc.get(Constant.URI_FIELD);
                String extension = doc.get(Constant.EXTENSION_FIELD);
                if (uri == null || extension == null) {
                    log.warn("entity document " + docHit.doc + " lacks uri or extension, skipping it");
                    continue;
                }

                IEntity ent = new Entity(pruneString(uri), extension);
                result.add(new KeywordElement(ent, KeywordElement.ENTITY, doc, docHit.score, ns));
            }
        } catch (Exception e) {
            // Best effort by contract (no checked exception is declared): report and keep partial results.
            log.error("entity search failed for query " + query, e);
        }
        return result;
    }

    /**
     * Executes the query and returns every matching document with its score.
     * <p>
     * Hits are returned in the order the searcher reports them to the collector; no limit
     * and no sorting is applied here.
     *
     * @param q query to execute
     * @return all hits, an empty array if nothing matched
     * @throws StorageException if the search fails with an I/O error
     */
    public ScoreDoc[] getDocuments(Query q) throws StorageException {
        final List<ScoreDoc> collected = new ArrayList<ScoreDoc>();
        HitCollector gatherAll = new HitCollector() {
            public void collect(int docId, float score) {
                collected.add(new ScoreDoc(docId, score));
            }
        };

        try {
            searcher.search(q, gatherAll);
        } catch (IOException e) {
            throw new StorageException(e);
        }

        ScoreDoc[] hits = new ScoreDoc[collected.size()];
        return collected.toArray(hits);
    }

    /**
     * Executes the query and returns the hits gathered by a top-documents collector of the
     * given size.
     *
     * @param q   query to execute
     * @param top number of hits the collector is created for
     * @return the collected top hits
     * @throws StorageException if the search fails with an I/O error
     */
    public ScoreDoc[] getTopDocuments(Query q, int top) throws StorageException {
        try {
            TopDocCollector topCollector = new TopDocCollector(top);
            searcher.search(q, topCollector);
            return topCollector.topDocs().scoreDocs;
        } catch (IOException e) {
            throw new StorageException(e);
        }
    }

    /**
     * Strips every double-quote character from the given string.
     *
     * @param str text to clean; must not be null
     * @return {@code str} without any {@code "} characters
     */
    private String pruneString(String str) {
        String doubleQuote = "\"";
        return str.replace(doubleQuote, "");
    }

    /**
     * Groups keyword compounds of the form {@code key<SEPARATOR>value} by their key.
     * <p>
     * Each query is trimmed and split at the separator. Only queries that split into exactly
     * two parts are accepted; anything else is reported through the error log and dropped.
     * Values of the same key are collected in a set, so duplicates collapse.
     *
     * @param queries keyword compounds to parse
     * @return map from key to the set of values given for it
     */
    public static Map<String, Collection<String>> parseQueries(Collection<String> queries) {
        Map<String, Collection<String>> compounds = new HashMap<String, Collection<String>>();
        for (String raw : queries) {
            String[] parts = raw.trim().split(SEPARATOR);
            if (parts.length != 2) {
                log.error("--------------------- ERROR! ---------------------");
                continue;
            }

            String key = parts[0];
            Collection<String> valuesForKey = compounds.get(key);
            if (valuesForKey == null) {
                valuesForKey = new HashSet<String>();
                compounds.put(key, valuesForKey);
            }
            valuesForKey.add(parts[1]);
        }

        return compounds;
    }

    /**
     * Splits a line into keywords at spaces, treating double-quoted sections as one unit.
     * <p>
     * Rules:
     * <ul>
     * <li>Quote characters are never part of a keyword; they only switch quoting on and off.</li>
     * <li>A space outside quotes ends the current keyword; runs of spaces produce no empty
     * keywords.</li>
     * <li>A space inside quotes is kept as part of the keyword.</li>
     * <li>A closing quote always ends the current keyword, even an empty one, so
     * {@code ""} yields one empty keyword.</li>
     * <li>Text directly in front of an opening quote is joined with the quoted text, e.g.
     * {@code name:"john smith"} yields {@code name:john smith}.</li>
     * <li>If a quote is never closed, the remaining text forms the last keyword.</li>
     * </ul>
     *
     * @param line text to split; null is treated like an empty line
     * @return the keywords in order of appearance, never null
     */
    public static LinkedList<String> getKeywordList(String line) {
        LinkedList<String> keywords = new LinkedList<String>();
        if (line == null) {
            return keywords;
        }

        // true while inside a quoted section
        boolean opened = false;
        // keyword currently being assembled
        StringBuilder acc = new StringBuilder();
        for (int i = 0; i < line.length(); i++) {
            char c = line.charAt(i);
            if (c == '"') {
                if (opened) {
                    // A closing quote flushes unconditionally, so "" gives an empty keyword.
                    keywords.add(acc.toString());
                    acc.setLength(0);
                }
                opened = !opened;
            } else if (c == ' ' && !opened) {
                // Separator: flush only when something was collected (checked by length,
                // not by comparing String references).
                if (acc.length() > 0) {
                    keywords.add(acc.toString());
                    acc.setLength(0);
                }
            } else {
                // Regular character, or a space inside quotes.
                acc.append(c);
            }
        }
        if (acc.length() > 0) {
            keywords.add(acc.toString());
        }

        return keywords;
    }
}