Example usage for org.apache.lucene.index IndexReader close

Introduction

On this page you can find example usage of org.apache.lucene.index.IndexReader.close().

Prototype

@Override
public final synchronized void close() throws IOException 

Document

Closes files associated with this index.
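A minimal sketch of the typical pattern, assuming a Lucene 4.x-style API (DirectoryReader, FSDirectory, IndexSearcher, as in several examples below) and a hypothetical index directory named "index": open the reader, run searches, and call close() in a finally block so the index files are released even if a search throws.

IndexReader reader = DirectoryReader.open(FSDirectory.open(new File("index"))); // hypothetical index location
try {
    IndexSearcher searcher = new IndexSearcher(reader);
    // ... run queries with the searcher ...
} finally {
    reader.close(); // closes the files associated with this index; the reader must not be used afterwards
}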

Usage

From source file:eyeskyhigh.lucene.demo.SearchFiles.java

License:Apache License

/** Simple command-line based search demo. */
public static void main(String[] args) throws Exception {
    String usage = "Usage:\tjava org.apache.lucene.demo.SearchFiles [-index dir] [-field f] [-repeat n] [-queries file] [-raw] [-norms field] [-paging hitsPerPage]";
    usage += "\n\tSpecify 'false' for hitsPerPage to use streaming instead of paging search.";
    if (args.length > 0 && ("-h".equals(args[0]) || "-help".equals(args[0]))) {
        System.out.println(usage);
        System.exit(0);
    }

    String index = "index";
    String field = "contents";
    String queries = null;
    int repeat = 0;
    boolean raw = false;
    String normsField = null;
    boolean paging = true;
    int hitsPerPage = 10;

    for (int i = 0; i < args.length; i++) {
        if ("-index".equals(args[i])) {
            index = args[i + 1];
            i++;
        } else if ("-field".equals(args[i])) {
            field = args[i + 1];
            i++;
        } else if ("-queries".equals(args[i])) {
            queries = args[i + 1];
            i++;
        } else if ("-repeat".equals(args[i])) {
            repeat = Integer.parseInt(args[i + 1]);
            i++;
        } else if ("-raw".equals(args[i])) {
            raw = true;
        } else if ("-norms".equals(args[i])) {
            normsField = args[i + 1];
            i++;
        } else if ("-paging".equals(args[i])) {
            if (args[i + 1].equals("false")) {
                paging = false;
            } else {
                hitsPerPage = Integer.parseInt(args[i + 1]);
                if (hitsPerPage == 0) {
                    paging = false;
                }
            }
            i++;
        }
    }
    Analyzer analyzer = new StandardAnalyzer();
    IndexWriter writer = new IndexWriter(index, analyzer, IndexWriter.MaxFieldLength.LIMITED);
    IndexReader reader = IndexReader.open(index);

    if (normsField != null)
        reader = new OneNormsReader(reader, normsField);

    Searcher searcher = new IndexSearcher(reader);

    BufferedReader in = null;
    if (queries != null) {
        in = new BufferedReader(new FileReader(queries));
    } else {
        in = new BufferedReader(new InputStreamReader(System.in, "UTF-8"));
    }
    //      QueryParser parser = new QueryParser(field, analyzer);
    //      parser.setAllowLeadingWildcard(true);
    while (true) {
        if (queries == null) // prompt the user
            System.out.println("Enter query: ");

        String line1 = in.readLine();//, line2 = in.readLine();

        if (line1 == null)
            break;

        line1 = line1.trim();
        if (line1.length() == 0)
            break;

        Query query;
        //      query = parser.parse(QueryParser.escape(line1));
        //      System.out.println(QueryParser.escape(line));
        //      query = new TermQuery(new Term(field, line1));
        query = new BooleanQuery();
        //      ((BooleanQuery)query).add(new PrefixQuery(new Term(field, line1)), BooleanClause.Occur.SHOULD);
        //      ((BooleanQuery)query).add(new PrefixQuery(new Term(field, line2)), BooleanClause.Occur.SHOULD);
        ((BooleanQuery) query).add(new WildcardQuery(new Term(field, line1)), BooleanClause.Occur.SHOULD);
        //      ((BooleanQuery)query).add(new WildcardQuery(new Term(field, line2)), BooleanClause.Occur.SHOULD);
        //      query = new WildcardQuery(new Term(field, line1));
        System.out.println("Searching for: " + query.toString(field));

        if (repeat > 0) { // repeat & time as benchmark
            Date start = new Date();
            for (int i = 0; i < repeat; i++) {
                searcher.search(query, null, 100);
            }
            Date end = new Date();
            System.out.println("Time: " + (end.getTime() - start.getTime()) + "ms");
        }

        if (paging) {
            doPagingSearch(in, searcher, query, hitsPerPage, raw, queries == null);
        } else {
            doStreamingSearch(searcher, query);
        }
    }
    reader.close();
    writer.close();
}

From source file:fi.semantum.strategia.Lucene.java

License:Open Source License

public static synchronized List<String> search(String databaseId, String search) throws IOException {

    ArrayList<String> result = new ArrayList<String>();

    IndexReader reader = null;

    try {

        reader = DirectoryReader.open(getDirectory(databaseId));
        IndexSearcher searcher = new IndexSearcher(reader);

        QueryParser parser = new QueryParser(Version.LUCENE_4_9, "text", getAnalyzer());
        parser.setAllowLeadingWildcard(true);
        Query query = parser.parse(search);

        TopDocs docs = searcher.search(query, Integer.MAX_VALUE);

        for (ScoreDoc scoreDoc : docs.scoreDocs) {

            try {

                DocumentStoredFieldVisitor visitor = new DocumentStoredFieldVisitor();

                reader.document(scoreDoc.doc, visitor);

                Document doc = visitor.getDocument();

                result.add(doc.get("uuid"));

            } catch (CorruptIndexException e) {
                throw new IOException(e);
            }

        }

    } catch (ParseException e) {

        throw new IOException(e);

    } finally {

        if (reader != null)
            reader.close();

    }

    return result;

}

From source file:FindIO.TextIndex.java

License:Apache License

public Map<String, double[]> searchText(String queryString) throws Exception {
    List<String> terms = Arrays.asList(queryString.trim().split(" "));

    IndexReader reader = DirectoryReader.open(FSDirectory.open(indexFile));
    IndexSearcher searcher = new IndexSearcher(reader);
    // :Post-Release-Update-Version.LUCENE_XY:
    Analyzer analyzer = new StandardAnalyzer();

    BufferedReader in = new BufferedReader(new InputStreamReader(System.in, StandardCharsets.UTF_8));

    // :Post-Release-Update-Version.LUCENE_XY:
    QueryParser parser = new QueryParser(fieldname1, analyzer);

    Query query = parser.parse(queryString);
    if (test)
        System.out.println("Searching for text: " + query.toString(fieldname1));

    TopDocs topDocs;
    if (test) { // repeat & time as benchmark
        long start = System.currentTimeMillis();
        topDocs = searcher.search(query, null, Common.topK);
        long end = System.currentTimeMillis();
        System.out.println("Time: " + (end - start) + " ms");
    } else {
        topDocs = searcher.search(query, null, Common.topK);
    }

    ScoreDoc[] hits = topDocs.scoreDocs;

    Map<String, double[]> mapResults = new HashMap<String, double[]>();
    //print out the top hits documents
    for (ScoreDoc hit : hits) {
        Document doc = searcher.doc(hit.doc);
        String tag = doc.get(fieldname1);
        int index = terms.indexOf(tag);
        if (index == -1) {
            continue;
        }
        String[] images = doc.get(fieldname2).split("\\s+");
        for (int i = 0; i < images.length; i += 2) {
            String imageName = images[i];
            String freq = images[i + 1];
            if (mapResults.get(imageName) == null) {
                mapResults.put(imageName, new double[terms.size()]);
            }
            double[] docTerms = mapResults.get(imageName);
            docTerms[index] = Double.parseDouble(freq);
        }
    }
    reader.close();

    return mapResults;
}

From source file:FindIO.TextIndex.java

License:Apache License

/**
 * Updates scores, mainly used for relevance feedback; the input should be stemmed.
 * @param imageID
 * @param tag_score_pairs
 * @throws Throwable
 */
public void updateScore(String imageID, List<FindIOPair> tag_score_pairs) throws Throwable {
    IndexReader reader = DirectoryReader.open(FSDirectory.open(indexFile));
    IndexSearcher searcher = new IndexSearcher(reader);
    // :Post-Release-Update-Version.LUCENE_XY:
    Analyzer analyzer = new StandardAnalyzer();

    BufferedReader in = new BufferedReader(new InputStreamReader(System.in, StandardCharsets.UTF_8));
    // :Post-Release-Update-Version.LUCENE_XY:
    QueryParser parser = new QueryParser(fieldname1, analyzer);

    for (FindIOPair pair : tag_score_pairs) {
        String tag = pair.getID();
        double add_score = pair.getValue();

        Query query = parser.parse(tag);

        System.out.println("Updating Text: " + query.toString(fieldname1));

        TopDocs topDocs;
        if (test) { // repeat & time as benchmark
            long start = System.currentTimeMillis();
            topDocs = searcher.search(query, null, Common.topK);
            long end = System.currentTimeMillis();
            System.out.println("Time: " + (end - start) + " ms");
        } else {
            topDocs = searcher.search(query, null, Common.topK);
        }

        ScoreDoc[] hits = topDocs.scoreDocs;
        if (hits.length == 0) { //It's a new tag
            Document doc = new Document();
            String img_score = imageID + " " + (0.1 * add_score) + " ";
            if (add_score > 0) {
                // set fields for document
                this.tag_field.setStringValue(this.textAnalyzer.getStem(tag));
                this.img_field.setStringValue(img_score);
                doc.add(tag_field);
                doc.add(img_field);
                MMwriter.addDocument(doc);
            }
        } else {
            //The tag is included in the index
            int docId = hits[0].doc;

            //retrieve the old document
            Document doc = searcher.doc(docId);

            //replacement field value
            String currentScores = doc.get(fieldname2);
            String[] img_score_pairs = currentScores.split(" ");
            StringBuilder stringBuilder = new StringBuilder();

            boolean isImageContained = false;

            for (int i = 0; i < img_score_pairs.length; i += 2) {
                String img = img_score_pairs[i];
                double old_score = Double.valueOf(img_score_pairs[i + 1]);
                double new_score = old_score + add_score;
                if (new_score < 0) {
                    new_score = 0;
                }
                String img_score_pair;
                if (img.equals(imageID)) {
                    img_score_pair = img + " " + new_score + " ";
                    isImageContained = true;
                } else {
                    img_score_pair = img + " " + old_score + " ";
                }
                stringBuilder.append(img_score_pair);
            }

            if (!isImageContained) { //If the image was not covered by the tag, append it to the tail
                stringBuilder.append(imageID + " " + add_score + " ");
            }

            //remove all occurrences of the old field
            doc.removeFields(fieldname2);

            this.img_field.setStringValue(stringBuilder.toString().trim());
            if (test)
                System.out.println(stringBuilder.toString());
            //insert the replacement
            doc.add(img_field);
            Term tagTerm = new Term(this.fieldname1, tag);
            MMwriter.updateDocument(tagTerm, doc);
        }

        MMwriter.commit();
    }
    reader.close();
    closeWriter();
}

From source file:fr.ericlab.sondy.algo.eventdetection.ET.java

License:Open Source License

public static LinkedList<String> getFrequentBigrams(String tweets, HashSet<String> bigrams) {
    try {
        LinkedList<String> FCB = new LinkedList<String>();
        WhitespaceAnalyzer analyzer = new WhitespaceAnalyzer(Version.LUCENE_36);
        RAMDirectory temporaryIndex = new RAMDirectory();
        IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_36, analyzer);
        IndexWriter temporaryWriter = new IndexWriter(temporaryIndex, config);
        Document doc = new Document();
        doc.add(new Field("content", tweets, Field.Store.NO, Field.Index.ANALYZED, Field.TermVector.YES));
        temporaryWriter.addDocument(doc);
        temporaryWriter.commit();
        IndexReader temporaryReader = IndexReader.open(temporaryWriter, true);
        TermEnum allTerms = temporaryReader.terms();
        while (allTerms.next()) {
            String term = allTerms.term().text();
            if (bigrams.contains(term)) {
                FCB.add(term);
            }
        }
        temporaryWriter.close();
        temporaryReader.close();
        temporaryIndex.close();
        return FCB;
    } catch (LockObtainFailedException ex) {
        Logger.getLogger(ET.class.getName()).log(Level.SEVERE, null, ex);
    } catch (IOException ex) {
        Logger.getLogger(ET.class.getName()).log(Level.SEVERE, null, ex);
    }
    return new LinkedList<>();
}

From source file:fr.ericlab.sondy.algo.eventdetection.MABED.java

License:Open Source License

MABEDTopic getRefinedTopic(MABEDTopic simpleTopic, int nbrelatedTerms) {
    MABEDTopic refinedTopic = new MABEDTopic();
    String[] frequentTerms = new String[nbrelatedTerms];
    try {
        StandardAnalyzer analyzer = new StandardAnalyzer(Version.LUCENE_36);
        RAMDirectory temporaryIndex = new RAMDirectory();
        IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_36, analyzer);
        IndexWriter temporaryWriter = new IndexWriter(temporaryIndex, config);
        Document doc = new Document();
        doc.add(new Field("content",
                dbAccess.getMessagesAsString(appVariables, simpleTopic.mainTerm, simpleTopic.I.timeSliceA,
                        simpleTopic.I.timeSliceB),
                Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.YES));
        temporaryWriter.addDocument(doc);
        temporaryWriter.commit();
        IndexReader temporaryReader = IndexReader.open(temporaryWriter, true);
        TermEnum allTerms = temporaryReader.terms();
        int minFreq = 0;
        TermInfoList termList = new TermInfoList();
        while (allTerms.next()) {
            String term = allTerms.term().text();
            if (!term.equals(simpleTopic.mainTerm) && term.length() > 1 && !appVariables.isStopWord(term)) {
                int cf = IndexAccess.getTermOccurenceCount(temporaryReader, term);
                if (cf > minFreq) {
                    termList.addTermInfo(new TermInfo(term, (int) cf));
                    termList.sortList();
                    if (termList.size() > nbrelatedTerms) {
                        termList.removeLast();
                    }
                    minFreq = termList.get(termList.size() - 1).occurence;
                }
            }
        }
        for (int i = 0; i < termList.size() && i < nbrelatedTerms; i++) {
            frequentTerms[i] = termList.get(i).text;
        }
        temporaryWriter.close();
        temporaryReader.close();
        temporaryIndex.close();

        float ref[] = indexAccess.getTermFrequency(appVariables, simpleTopic.mainTerm);
        float comp[];
        refinedTopic = new MABEDTopic(simpleTopic.mainTerm, simpleTopic.I, simpleTopic.score,
                simpleTopic.anomaly);
        for (int j = 0; j < nbrelatedTerms && frequentTerms[j] != null; j++) {
            comp = indexAccess.getTermFrequency(appVariables, frequentTerms[j]);
            double w = getErdemCoefficient(ref, comp, simpleTopic.I.timeSliceA, simpleTopic.I.timeSliceB);
            if (w >= _THETA_) {
                refinedTopic.relatedTerms.add(new MABEDWeightedTerm(frequentTerms[j], w));
            }
        }
    } catch (IOException ex) {
        Logger.getLogger(MABED.class.getName()).log(Level.SEVERE, null, ex);
    }
    return refinedTopic;
}

From source file:fr.ericlab.sondy.core.DataManipulation.java

License:Open Source License

public String[] getFrequentCoocurringTerms(String document, int numTerms, String baseTerm,
        AppVariables appVariables) {
    String[] frequentTerms = new String[numTerms];
    try {
        StandardAnalyzer analyzer = new StandardAnalyzer(Version.LUCENE_36);
        RAMDirectory index = new RAMDirectory();
        IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_36, analyzer);
        IndexWriter w = new IndexWriter(index, config);
        Document doc = new Document();
        doc.add(new Field("content", document, Field.Store.NO, Field.Index.ANALYZED, Field.TermVector.YES));
        w.addDocument(doc);
        w.commit();
        IndexReader r = IndexReader.open(w, true);
        TermEnum allTerms = r.terms();
        int minFreq = 0;
        TermInfoList termList = new TermInfoList();
        StopWords stopWords = appVariables.currentStopWords;
        HashSet<String> stopWordsSet = stopWords.getSet();
        stopWords.add(baseTerm);
        while (allTerms.next()) {
            String term = allTerms.term().text();
            if (term.length() > 1 && !stopWordsSet.contains(term)) {
                float cf = getTermOccurenceCount(r, term);
                if (cf > minFreq) {
                    termList.addTermInfo(new TermInfo(term, (int) cf));
                    termList.sortList();
                    if (termList.size() > numTerms) {
                        termList.removeLast();
                    }
                    minFreq = termList.get(termList.size() - 1).occurence;
                }
            }
        }
        for (int i = 0; i < termList.size(); i++) {
            frequentTerms[i] = termList.get(i).text;
        }
        w.close();
        r.close();
        index.close();
    } catch (Exception ex) {
        Logger.getLogger(DataManipulation.class.getName()).log(Level.SEVERE, null, ex);
    }
    return frequentTerms;
}

From source file:fr.ericlab.sondy.core.DataManipulation.java

License:Open Source License

public String[] getFrequentCoocurringTermsFromFile(int numTerms, String baseTerm, AppVariables appVariables) {
    String[] frequentTerms = new String[numTerms];
    try {
        BufferedReader input = new BufferedReader(new FileReader("tmp.msg"));
        StandardAnalyzer analyzer = new StandardAnalyzer(Version.LUCENE_36);
        RAMDirectory index = new RAMDirectory();
        IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_36, analyzer);
        IndexWriter w = new IndexWriter(index, config);
        String line = "";
        String document = "";
        int count = 0;
        while ((line = input.readLine()) != null) {
            count++;
            document += line;
            if (count == 2000) {
                Document doc = new Document();
                doc.add(new Field("content", document, Field.Store.NO, Field.Index.ANALYZED,
                        Field.TermVector.YES));
                w.addDocument(doc);
                w.commit();
                count = 0;
                document = "";
            }
        }
        Document doc = new Document();
        doc.add(new Field("content", document, Field.Store.NO, Field.Index.ANALYZED, Field.TermVector.YES));
        w.addDocument(doc);
        w.commit();
        input.close();
        IndexReader r = IndexReader.open(w, true);
        TermEnum allTerms = r.terms();
        int minFreq = 0;
        TermInfoList termList = new TermInfoList();
        StopWords stopWords = appVariables.currentStopWords;
        HashSet<String> stopWordsSet = stopWords.getSet();
        stopWords.add(baseTerm);
        while (allTerms.next()) {
            String term = allTerms.term().text();
            if (term.length() > 1 && !stopWordsSet.contains(term)) {
                float cf = getTermOccurenceCount(r, term);
                if (cf > minFreq) {
                    termList.addTermInfo(new TermInfo(term, (int) cf));
                    termList.sortList();
                    if (termList.size() > numTerms) {
                        termList.removeLast();
                    }
                    minFreq = termList.get(termList.size() - 1).occurence;
                }
            }
        }
        for (int i = 0; i < termList.size(); i++) {
            frequentTerms[i] = termList.get(i).text;
        }
        w.close();
        r.close();
        index.close();
    } catch (Exception ex) {
        Logger.getLogger(DataManipulation.class.getName()).log(Level.SEVERE, null, ex);
    }
    return frequentTerms;
}

From source file:fr.lipn.yasemir.ontology.annotation.KNNAnnotator.java

License:Open Source License

public DocumentAnnotation annotate(String document) {
    DocumentAnnotation ret = new DocumentAnnotation();

    try {
        IndexReader reader = IndexReader.open(FSDirectory.open(new File(termIndexPath)));
        IndexSearcher searcher = new IndexSearcher(reader);

        document = document.replaceAll("Support, .+?;", "");
        document = document.replaceAll("\\[.*?\\]", "").trim();
        //document = document.replaceAll( "\\p{Punct}", " " );
        String[] fragments = document.split("[;:\\.,]");

        for (String ofragment : fragments) {
            ofragment = ofragment.replaceAll("\\p{Punct}", " ");
            ofragment = ofragment.trim();
            String sa[] = ofragment.split("(?<=[ \\n])");
            EnglishStemmer st = new EnglishStemmer();
            StringBuffer fbuf = new StringBuffer();
            for (String s : sa) {
                st.setCurrent(s.trim());
                st.stem();
                fbuf.append(st.getCurrent());
                fbuf.append(" ");
            }

            String fragment = fbuf.toString().trim(); //stemmed fragment

            if (fragment.length() == 0)
                continue;
            //System.err.println("Annotating: "+fragment);

            //use K-NN annotation (see Trieschnigg et al. 2009)
            IndexReader docreader = IndexReader.open(FSDirectory.open(new File(this.standardIndexPath)));
            IndexSearcher docsearcher = new IndexSearcher(docreader);

            QueryParser parser = new QueryParser(Version.LUCENE_44, "text", Yasemir.analyzer);
            Query query = parser.parse(fragment);
            System.err.println("Looking for: " + query);
            TopDocs results = docsearcher.search(query, N); //get the first 100 documents
            ScoreDoc[] hits = results.scoreDocs;

            int topLimit = Math.min(results.totalHits, K);
            int bottomLimit = Math.min(results.totalHits, N) - K;
            int numTotalHits = Math.min(results.totalHits, N);

            //System.err.println("top:"+topLimit+" bottom:"+bottomLimit+" total:"+numTotalHits);
            HashMap<String, Double> ttags = new HashMap<String, Double>();
            HashMap<String, Integer> btags = new HashMap<String, Integer>();
            if (topLimit < bottomLimit) {
                //Get the tags used in the top K documents matching the request
                hits = docsearcher.search(query, numTotalHits).scoreDocs;
                for (int i = 0; i < topLimit; i++) {
                    Document doc = docsearcher.doc(hits[i].doc);
                    Vector<String> tags = new Vector<String>();
                    List<IndexableField> docFields = doc.getFields();
                    for (IndexableField f : docFields) {
                        String fname = f.name();
                        if (fname.endsWith("annot")) {
                            tags.add(fname + ":" + doc.get(fname));
                        }
                    }

                    String[] tagStrings = tags.toArray(new String[0]);
                    for (String t : tagStrings) {
                        t = t.replaceAll("\\W|_", " ");
                        Double nt = ttags.get(t);
                        if (nt == null)
                            nt = new Double(hits[i].score);
                        else
                            nt = new Double(hits[i].score + nt.doubleValue());
                        ttags.put(t, nt);
                    }
                }
                for (int i = bottomLimit; i < numTotalHits; i++) {
                    Document doc = docsearcher.doc(hits[i].doc);
                    Vector<String> tags = new Vector<String>();
                    List<IndexableField> docFields = doc.getFields();
                    for (IndexableField f : docFields) {
                        String fname = f.name();
                        if (fname.endsWith("annot")) {
                            tags.add(fname + ":" + doc.get(fname));
                        }
                    }

                    String[] tagStrings = tags.toArray(new String[0]);
                    for (String t : tagStrings) {
                        t = t.replaceAll("\\W|_", " ");
                        Integer nt = btags.get(t);
                        if (nt == null)
                            nt = new Integer(1);
                        else
                            nt = new Integer((nt.intValue() + 1));
                        btags.put(t, nt);
                    }
                }

            }

            Vector<WeightedTag> tagv = new Vector<WeightedTag>();
            //now find, for all tags, the corresponding MeSH concepts
            double sum = 0;
            for (String tag : ttags.keySet()) {
                double tagStrength = ttags.get(tag).doubleValue();
                double compStrength = 0;
                if (btags.containsKey(tag)) {
                    compStrength = (btags.get(tag).doubleValue()) / ((double) K);
                }
                //System.err.println(tag+ " :str="+tagStrength+", comp="+compStrength);
                double weight = tagStrength * (1 - compStrength);
                sum += weight;
                tagv.add(new WeightedTag(tag, weight));
            }
            double avg = sum / (double) tagv.size();

            double ssum = 0;
            for (WeightedTag wt : tagv) {
                ssum += Math.sqrt(Math.pow(wt.getWeight() - avg, 2d));
            }
            double stddev = ssum / (double) tagv.size();

            //System.err.println("avg w: "+avg+" stddev:"+stddev+" limit:"+(avg+2*stddev));
            double limit = (avg + 2 * stddev); //definition of statistic outlier

            TagComparator comparator = new TagComparator();
            Collections.sort(tagv, comparator);

            int i = 0;
            for (WeightedTag wt : tagv) {
                String tag = wt.getName();
                if (i >= maxTags)
                    break;
                if (wt.getWeight() >= limit) {
                    QueryParser tagparser = new QueryParser(Version.LUCENE_44, "labels", Yasemir.analyzer);
                    Query tagquery = tagparser.parse("\"" + tag + "\"");

                    TopDocs tagresults = searcher.search(tagquery, 5);
                    ScoreDoc[] taghits = tagresults.scoreDocs;

                    int numTagTotalHits = tagresults.totalHits;

                    if (numTagTotalHits > 0) {
                        taghits = searcher.search(tagquery, numTagTotalHits).scoreDocs;
                        Document doc = searcher.doc(taghits[0].doc);

                        Annotation ann = new Annotation(doc.get("id"));
                        //System.err.println("Adding: "+tag+" w:"+wt.getWeight());
                        String ontoID = ann.getRelatedOntology().getOntologyID();

                        Vector<Annotation> annotations = ret.get(ontoID);
                        if (annotations == null)
                            annotations = new Vector<Annotation>();
                        annotations.add(ann);
                        ret.put(ontoID, annotations);

                        i++;
                    }
                }

            }
            docreader.close();

        }
        reader.close();
    } catch (Exception e) {
        e.printStackTrace();
    }
    return ret;

}

From source file:fr.lipn.yasemir.ontology.annotation.SentenceBasedAnnotator.java

License:Open Source License

/**
 * Implementation of the annotate method by IndexBasedAnnotator.
 *
 * The input text is split into fragments according to punctuation;
 * every fragment is used as a query and sent to a Lucene search engine
 * that was used to index the terminology (BM25 weighting).
 * Up to the top 20 results returned by the system are taken as the annotation for the
 * fragment text. All the fragment annotations combined make up the document annotation
 * that is returned by this method.
 * 
 */
public DocumentAnnotation annotate(String document) {
    DocumentAnnotation ret = new DocumentAnnotation();

    try {
        IndexReader reader = IndexReader.open(FSDirectory.open(new File(termIndexPath)));
        IndexSearcher searcher = new IndexSearcher(reader);
        searcher.setSimilarity(new BM25Similarity());

        /*
        document=document.replaceAll("\\[.*?\\]", "").trim();
        //document = document.replaceAll( "\\p{Punct}", " " );
        String [] fragments = document.split("[;:\\.,]");
        */

        String[] fragments = getSentences(document).toArray(new String[0]);

        for (String ofragment : fragments) {
            ofragment = ofragment.replaceAll("\\p{Punct}", " ");
            ofragment = ofragment.trim();
            String sa[] = ofragment.split("(?<=[ \\n])");
            EnglishStemmer st = new EnglishStemmer();
            StringBuffer fbuf = new StringBuffer();
            for (String s : sa) {
                st.setCurrent(s.trim());
                st.stem();
                fbuf.append(st.getCurrent());
                fbuf.append(" ");
            }

            String fragment = fbuf.toString().trim(); //stemmed fragment

            if (fragment.length() == 0)
                continue;
            //System.err.println("Annotating: "+fragment);

            QueryParser parser = new QueryParser(Version.LUCENE_44, "labels", Yasemir.analyzer);
            Query query = parser.parse(fragment);
            String stemmedFragment = query.toString("labels").replaceAll("labels:", "");

            TopDocs results = searcher.search(query, 20);
            ScoreDoc[] hits = results.scoreDocs;

            int numTotalHits = results.totalHits;
            //System.err.println(numTotalHits + " total matching classes");

            if (numTotalHits > 0) {
                hits = searcher.search(query, numTotalHits).scoreDocs;
                for (int i = 0; i < Math.min(numTotalHits, MAX_ANNOTS); i++) {
                    Document doc = searcher.doc(hits[i].doc);
                    String ptrn = "(?i)(" + doc.get("labels").replaceAll(", ", "|") + ")";
                    //System.err.println("OWLClass="+doc.get("id")+" score="+hits[i].score);
                    if (Tools.checkPattern(stemmedFragment, ptrn)) {
                        //System.err.println("OK: OWLClass="+doc.get("id")+" score="+hits[i].score);
                        Annotation ann = new Annotation(doc.get("id"));
                        String ontoID = ann.getRelatedOntology().getOntologyID();

                        Vector<Annotation> annotations = ret.get(ontoID);
                        if (annotations == null)
                            annotations = new Vector<Annotation>();
                        annotations.add(ann);
                        ret.put(ontoID, annotations);
                    }
                }
            }

        }
        reader.close();
    } catch (Exception e) {
        e.printStackTrace();
    }
    return ret;

}