edu.cmu.geolocator.resource.gazindexing.CollaborativeIndex.CollaborativeIndex.java Source code

Introduction

Here is the source code for edu.cmu.geolocator.resource.gazindexing.CollaborativeIndex.CollaborativeIndex.java
Source

/**
 * 
 * Copyright (c) 2012 - 2014 Carnegie Mellon University
 * 
Licensed to the Apache Software Foundation (ASF) under one
or more contributor license agreements.  See the NOTICE file
distributed with this work for additional information
regarding copyright ownership.  The ASF licenses this file
to you under the Apache License, Version 2.0 (the
"License"); you may not use this file except in compliance
with the License.  You may obtain a copy of the License at
    
  http://www.apache.org/licenses/LICENSE-2.0
    
Unless required by applicable law or agreed to in writing,
software distributed under the License is distributed on an
"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
KIND, either express or implied.  See the License for the
specific language governing permissions and limitations
under the License.
 * 
 * @author Wei Zhang,  Language Technology Institute, School of Computer Science, Carnegie-Mellon University.
 * email: wei.zhang@cs.cmu.edu
 *
 * 
 */

package edu.cmu.geolocator.resource.gazindexing.CollaborativeIndex;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.HashSet;

import org.apache.lucene.document.Document;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.BooleanClause.Occur;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopDocs;

import edu.cmu.geolocator.GlobalParam;
import edu.cmu.geolocator.io.GetReader;
import edu.cmu.geolocator.model.CandidateAndFeature;
import edu.cmu.geolocator.model.LocEntityAnnotation;
import edu.cmu.geolocator.resource.ResourceFactory;
import edu.cmu.geolocator.resource.gazindexing.Index;

public class CollaborativeIndex implements Index {

    // Lucene searchers assigned in open(): stringSearcher reads the index at stringIndexName,
    // infoSearcher reads the index at infoIndexName.
    private IndexSearcher stringSearcher, infoSearcher;

    // "ID" values collected by the most recent getDocumentsByPhrase() call.
    private HashSet<String> ids;

    // OR-query over the collected IDs; rebuilt by every getDocumentsByPhrase() call.
    private BooleanQuery q;

    // Result list created and returned by the most recent getDocumentsByPhrase() call.
    private ArrayList<Document> returnDocs;

    // Index locations and load modes stored by config() and handed to
    // GetReader.getIndexSearcher() in open().
    private String stringIndexName, infoIndexName, stringLoad, infoLoad;

    // Shared instance, created on first use by getInstance().
    private static CollaborativeIndex ci;

    /**
     * Returns the shared index instance, building it on the first call.
     * <p>
     * The instance is configured with the {@code /StringIndex} and {@code /InfoIndex}
     * sub-directories of {@link GlobalParam#getGazIndex()}, both loaded in {@code "mmap"} mode, and
     * is opened before being cached. No synchronization is performed here.
     *
     * @return the cached {@code CollaborativeIndex}
     */
    public static CollaborativeIndex getInstance() {
        if (ci != null) {
            return ci;
        }
        CollaborativeIndex created = new CollaborativeIndex();
        String stringIndexPath = GlobalParam.getGazIndex() + "/StringIndex";
        String infoIndexPath = GlobalParam.getGazIndex() + "/InfoIndex";
        created.config(stringIndexPath, infoIndexPath, "mmap", "mmap");
        ci = created.open();
        return ci;
    }

    /**
     * Stores the locations and load modes of the two underlying indexes. Nothing is opened here;
     * call {@link #open()} afterwards.
     *
     * @param stringIndexName location of the string index
     * @param infoIndexName location of the info index
     * @param stringLoad load mode passed to {@code GetReader.getIndexSearcher} for the string index
     * @param infoLoad load mode passed to {@code GetReader.getIndexSearcher} for the info index
     * @return this instance, to allow call chaining
     */
    public CollaborativeIndex config(String stringIndexName, String infoIndexName, String stringLoad,
            String infoLoad) {
        // settings of the string index
        this.stringIndexName = stringIndexName;
        this.stringLoad = stringLoad;

        // settings of the info index
        this.infoIndexName = infoIndexName;
        this.infoLoad = infoLoad;

        return this;
    }

    /*
     * public boolean inIndexStrict(String phrase) { if (phrase == null || phrase.length() == 0) throw
     * new NullPointerException(); TermQuery query = new TermQuery(new Term("LOWERED_ORIGIN",
     * phrase.toLowerCase())); //If it's an abbreviation, then return true; ignore case.
     * 
     * if (ResourceFactory.getCountryCode2CountryMap().isCountryAbbreviation(phrase)) return true;
     * TopDocs res = null; try { res = stringSearcher.search(query, 1); } catch (IOException e) { //
     * TODO Auto-generated catch block e.printStackTrace(); } if (res == null) return false; else
     * return res.totalHits > 0 ? true : false; }
     */
    /**
     * Tells whether a phrase is known to the gazetteer.
     * <p>
     * A single leading {@code '#'} is dropped first. The phrase counts as known when its trimmed,
     * lower-cased form is a key of the country-code map, or when its lower-cased form with all
     * spaces removed matches the {@code LOWERED-NO-WS} field of the string index.
     *
     * @param phrase the phrase to look up; must be non-null and non-empty
     * @return {@code true} if the phrase is found; {@code false} if not, or if the index search
     *         fails with an {@link IOException} (the stack trace is printed)
     * @throws NullPointerException if {@code phrase} is null or empty
     */
    @Override
    public boolean inIndex(String phrase) {
        if (phrase == null || phrase.length() == 0) {
            throw new NullPointerException();
        }
        String candidate = phrase.startsWith("#") ? phrase.substring(1) : phrase;

        // Country codes/names are answered from the in-memory map without touching the index.
        String countryKey = candidate.trim().toLowerCase();
        if (ResourceFactory.getCountryCode2CountryMap().isInMap(countryKey)) {
            return true;
        }

        String collapsed = candidate.toLowerCase().replace(" ", "");
        TermQuery lookup = new TermQuery(new Term("LOWERED-NO-WS", collapsed));
        try {
            TopDocs hits = stringSearcher.search(lookup, 1);
            return hits != null && hits.totalHits > 0;
        } catch (IOException e) {
            // TODO Auto-generated catch block
            e.printStackTrace();
            return false;
        }
    }

    /**
     * Looks up the info documents for a phrase.
     * <p>
     * A single leading {@code '#'} is dropped. The lower-cased phrase with all spaces removed is
     * matched against the {@code LOWERED-NO-WS} field of the string index (at most 2500 hits); the
     * {@code ID} values of those hits, plus the ID of a matching entry in the country-code map if
     * there is one, are then OR-ed together and searched in the info index (at most 2500 hits).
     *
     * @param phrase the phrase to look up; must be non-null and non-empty
     * @return the matching info documents, or {@code null} when neither the string index nor the
     *         country-code map knows the phrase, or when the info-index search fails with an
     *         {@link IOException}
     * @throws NullPointerException if {@code phrase} is null or empty
     */
    @Override
    public ArrayList<Document> getDocumentsByPhrase(String phrase) {
        if (phrase == null || phrase.length() == 0)
            throw new NullPointerException();
        if (phrase.startsWith("#"))
            phrase = phrase.substring(1);
        String lowered = phrase.toLowerCase();

        TermQuery query = new TermQuery(new Term("LOWERED-NO-WS", lowered.replace(" ", "")));
        TopDocs res = null;
        try {
            res = stringSearcher.search(query, 2500);
        } catch (IOException e) {
            // Best effort: fall through and rely on the country-code map alone.
            e.printStackTrace();
        }

        // Same key normalisation as inIndex() (trimmed, lower-cased), so that a phrase accepted
        // there via the country-code map is also resolved here.
        String countryKey = lowered.trim();
        String abIds = null;
        if (ResourceFactory.getCountryCode2CountryMap().isInMap(countryKey))
            abIds = ResourceFactory.getCountryCode2CountryMap().getValue(countryKey).getId();

        // res may be null after a failed search; it must not be dereferenced in that case even
        // when a country ID was found.
        boolean noStringHits = res == null || res.totalHits == 0;
        if (noStringHits && abIds == null)
            return null;

        ids = new HashSet<String>();
        if (res != null) {
            try {
                for (ScoreDoc doc : res.scoreDocs) {
                    String id = stringSearcher.doc(doc.doc).get("ID");
                    // A hit without a stored ID cannot be turned into a term.
                    if (id != null)
                        ids.add(id);
                }
            } catch (Exception e) {
                e.printStackTrace();
            }
        }
        if (abIds != null)
            ids.add(abIds);

        // NOTE(review): 2500 distinct string-index IDs plus a country ID would give 2501 clauses,
        // one more than the limit installed by open() -- confirm whether that can occur.
        q = new BooleanQuery();
        for (String id : ids) {
            q.add(new TermQuery(new Term("ID", id)), Occur.SHOULD);
        }

        try {
            TopDocs docs = infoSearcher.search(q, 2500);
            // Size by what was actually returned; totalHits may exceed the requested 2500.
            returnDocs = new ArrayList<Document>(docs.scoreDocs.length);
            for (ScoreDoc d : docs.scoreDocs) {
                returnDocs.add(infoSearcher.doc(d.doc));
            }
            return returnDocs;
        } catch (IOException e) {
            e.printStackTrace();
        }
        return null;
    }

    /*
     * public ArrayList<Document> getDocumentsByPhraseStrict(String phrase) { if (phrase == null ||
     * phrase.length() == 0) throw new NullPointerException(); TermQuery query = new TermQuery(new
     * Term("LOWERED_ORIGIN", phrase.toLowerCase())); TopDocs res = null; try { res =
     * stringSearcher.search(query, 200); } catch (IOException e) { // TODO Auto-generated catch block
     * e.printStackTrace(); } System.out.println(res.totalHits);
     * 
     * if (res == null) return null; if (res.totalHits == 0) return null; ids = new
     * HashSet<String>(res.totalHits); try { for (ScoreDoc doc : res.scoreDocs) {
     * ids.add(stringSearcher.doc(doc.doc).get("ID")); } } catch (Exception e) { e.printStackTrace();
     * } if (ResourceFactory.getCountryCode2CountryMap().isCountryAbbreviation(phrase))
     * ids.add(ResourceFactory.getCountryCode2CountryMap().getValue(phrase).getId()); //
     * System.out.println("total number of String ids are:" + ids.size()); q = new BooleanQuery();
     * 
     * for (String id : ids) { q.add(new TermQuery(new Term("ID", id)), Occur.SHOULD); } // use a term
     * filter instead of a query filter. try { TopDocs docs = infoSearcher.search(q, 200); //
     * System.out.println("total hits in info is:" + docs.totalHits); returnDocs = new
     * ArrayList<Document>(docs.totalHits); for (ScoreDoc d : docs.scoreDocs) {
     * returnDocs.add(infoSearcher.doc(d.doc)); } return returnDocs; } catch (IOException e) { // TODO
     * Auto-generated catch block e.printStackTrace(); } return null; }
     */
    /**
     * Opens both searchers using the settings stored by
     * {@link #config(String, String, String, String)}.
     * <p>
     * Failures are not propagated: the stack trace is printed and the affected searcher field is
     * left as it was. The global {@link BooleanQuery} clause limit is raised to 2500 once the string
     * index has been opened.
     *
     * @return this instance, to allow call chaining
     */
    public CollaborativeIndex open() {
        // 1) string index, then the clause limit needed by the ID disjunctions
        try {
            stringSearcher = GetReader.getIndexSearcher(stringIndexName, stringLoad);
            BooleanQuery.setMaxClauseCount(2500);
        } catch (Exception stringIndexFailure) {
            // TODO Auto-generated catch block
            stringIndexFailure.printStackTrace();
        }

        // 2) info index, attempted even when step 1 failed
        try {
            infoSearcher = GetReader.getIndexSearcher(infoIndexName, infoLoad);
        } catch (Exception infoIndexFailure) {
            // TODO Auto-generated catch block
            infoIndexFailure.printStackTrace();
        }

        return this;
    }

    /**
     * Currently a no-op: the searchers assigned in {@link #open()} are not released here.
     */
    @Override
    public void close() {
        // TODO: not implemented yet; nothing is closed.

    }

    /**
     * Collects the distinct {@code LOWERED_ORIGIN} values of the string-index entries that carry
     * the given ID (at most 2500 entries are examined).
     * <p>
     * Both the search and the document fetch go through the string searcher: Lucene document
     * numbers are only meaningful within the index that produced them, so hits from one index must
     * not be resolved against the other.
     * NOTE(review): this relies on {@code ID} being searchable in the string index -- confirm
     * against the indexer.
     *
     * @param id the ID whose names are wanted
     * @return the names found; empty when there are none or when the search fails with an
     *         {@link IOException} (the stack trace is printed)
     */
    public String[] getAlternateNames(String id) {
        // Start with an empty set so a failed search yields an empty array instead of an NPE.
        HashSet<String> names = new HashSet<String>();
        Query query = new TermQuery(new Term("ID", id));
        try {
            TopDocs topDocs = stringSearcher.search(query, 2500);
            for (ScoreDoc doc : topDocs.scoreDocs) {
                String name = stringSearcher.doc(doc.doc).get("LOWERED_ORIGIN");
                // Skip entries without the field rather than returning a null element.
                if (name != null)
                    names.add(name);
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
        return names.toArray(new String[names.size()]);
    }

    /**
     * Fetches the first info-index document whose {@code ID} field equals the given id.
     *
     * @param id the ID to look up; must be non-null and non-empty
     * @return the document, or {@code null} when there is no hit or when the index access fails
     *         with an {@link IOException} (the stack trace is printed)
     * @throws NullPointerException if {@code id} is null or empty
     */
    @Override
    public Document getDocumentsById(String id) {
        if (id == null || id.length() == 0) {
            throw new NullPointerException();
        }
        TermQuery byId = new TermQuery(new Term("ID", id));
        try {
            TopDocs hits = infoSearcher.search(byId, 1);
            if (hits == null || hits.totalHits == 0) {
                return null;
            }
            return infoSearcher.doc(hits.scoreDocs[0].doc);
        } catch (IOException e) {
            // TODO Auto-generated catch block
            e.printStackTrace();
            return null;
        }
    }

    /**
     * Manual smoke test. Prints the number of documents found for "newport" and a summary line
     * for every document found for "San Francisco", then stops.
     * <p>
     * An interactive loop that reads phrases (or IDs) from standard input follows; it is switched
     * off by the local {@code interactive} flag.
     *
     * @param argv ignored
     * @throws IOException if reading from standard input fails
     */
    public static void main(String argv[]) throws IOException {
        GlobalParam.setGazIndex("/users/indri/GazIndex");
        // GlobalParam.setGeoNames("GeoNames");
        CollaborativeIndex ci = ResourceFactory.getClbIndex();

        // getDocumentsByPhrase() returns null when nothing matches, so guard before use.
        ArrayList<Document> newportDocs = ci.getDocumentsByPhrase("newport");
        System.out.println(newportDocs == null ? 0 : newportDocs.size());

        ArrayList<Document> sanFranciscoDocs = ci.getDocumentsByPhrase("San Francisco");
        if (sanFranciscoDocs != null) {
            for (Document doc : sanFranciscoDocs) {
                System.out.println(doc.get(InfoFields.name) + " " + doc.get(InfoFields.countryCode) + " "
                        + doc.get(InfoFields.adm1Code) + "\t" + doc.get(InfoFields.adm2Code) + "\t"
                        + doc.get(InfoFields.alternativeNamesCount) + "\t" + doc.get(InfoFields.population) + "\t"
                        + doc.get(InfoFields.id) + "\t" + doc.get(InfoFields.latitude) + "\t"
                        + doc.get(InfoFields.longitude));
            }
        }

        // Flip to true to enable the console loop below.
        final boolean interactive = false;
        if (!interactive)
            return;

        boolean mode = true; // string
        // mode = false; // id

        BufferedReader br = new BufferedReader(new InputStreamReader(System.in));
        String s = null;
        while ((s = br.readLine()) != null) {
            // inIndex() and getDocumentsById() reject empty input with an exception.
            if (s.length() == 0)
                continue;
            if (mode) {
                ArrayList<Document> found = null;
                if (ResourceFactory.getClbIndex().inIndex(s))
                    found = ResourceFactory.getClbIndex().getDocumentsByPhrase(s);
                if (found == null) {
                    System.out.println("null.");
                } else {
                    for (Document d : found)
                        System.out.println(d);
                }
            } else {
                Document doc = ci.getDocumentsById(s);
                System.out.println(doc);
                CandidateAndFeature gc = new CandidateAndFeature(s, doc, new LocEntityAnnotation(0, 0, s, null));
                System.out.println(gc.getId() + " " + gc.getAsciiName() + " " + gc.getCountryCode());
            }
        }
    }
}