jobs.ComputeNCITdistribution.java Source code


Introduction

Here is the source code for jobs.ComputeNCITdistribution.java, a Play Framework job that iterates over the NCIT ontology terms stored via Morphia and records, for each term, how many documents in a Lucene index contain it as an exact phrase.

Source

/*
 * To change this license header, choose License Headers in Project Properties.
 * To change this template file, choose Tools | Templates
 * and open the template in the editor.
 */
package jobs;

import com.google.common.base.Stopwatch;

import java.util.List;
import java.util.concurrent.TimeUnit;
import models.MorphiaOntologyTerm;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;
import play.Logger;
import play.jobs.Job;
import play.vfs.VirtualFile;
import utils.CustomStopWordsStandardAnalyzer;
import utils.Utils;

/**
 * TO KEEP
 * Iterates over the concepts of the NCIT and counts how many indexed
 * documents contain each one. Important in order to estimate how often
 * curated terms are present in the wild.
 * @author loopasam
 */
public class ComputeNCITdistribution extends Job {

    @Override
    public void doJob() throws Exception {

        Logger.info("Job started...");
        Stopwatch stopwatch = Stopwatch.createStarted();

        List<MorphiaOntologyTerm> terms = MorphiaOntologyTerm.findAll();

        int total = terms.size();
        int counter = 0;
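        // Resolve the on-disk Lucene index directory relative to the
        // application root via Play's VirtualFile abstraction.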
        Directory directory = FSDirectory.open(VirtualFile.fromRelativePath("/indexes/index-2013").getRealFile());
        DirectoryReader ireader = DirectoryReader.open(directory);

        //Just chunk the words into tokens - no stop-word removal - such concepts would give no results in principle
        Analyzer analyzer = new CustomStopWordsStandardAnalyzer(Version.LUCENE_47);
        IndexSearcher isearcher = new IndexSearcher(ireader);
        QueryParser parser = new QueryParser(Version.LUCENE_47, "contents", analyzer);

        for (MorphiaOntologyTerm ontologyTerm : terms) {
            counter++;

            Logger.info("i: " + counter + "/" + total);
            Stopwatch timeQuery = Stopwatch.createStarted();
            // Escape the term so that embedded quotes or backslashes cannot
            // break the phrase syntax, then search for the exact phrase.
            Query query = parser.parse("\"" + QueryParser.escape(ontologyTerm.value) + "\"");
            ScoreDoc[] hits = isearcher.search(query, null, 100000000).scoreDocs;
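            // The very large top-N value is only there so that every hit is
            // returned and can be counted via hits.length; a cheaper counting
            // approach is sketched after this listing.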
            timeQuery.stop();
            Logger.info("Query time: " + timeQuery.elapsed(TimeUnit.MILLISECONDS) + " ms");

            Stopwatch timeUpdate = Stopwatch.createStarted();
            // Persist the number of matching documents as the term's corpus frequency.
            ontologyTerm.frequency = hits.length;
            ontologyTerm.save();
            timeUpdate.stop();
            Logger.info("Update time: " + timeUpdate.elapsed(TimeUnit.MILLISECONDS));
            Logger.info("Query: " + ontologyTerm.value + " - " + hits.length);
        }

        // Release the index resources now that all terms have been processed.
        ireader.close();
        directory.close();
        analyzer.close();

        stopwatch.stop();
        Utils.emailAdmin("Distribution completed",
                "Job finished in " + stopwatch.elapsed(TimeUnit.MINUTES) + " minutes.");
        Logger.info("Job finished");

    }

}
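
Since the job only needs the number of matching documents, asking the searcher for up to 100,000,000 ScoreDocs is heavier than necessary. Below is a minimal sketch of a cheaper counting variant against the same Lucene 4.7 API; the HitCounting class and countHits helper are illustrative names, not part of the original project.

import java.io.IOException;

import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TotalHitCountCollector;

public class HitCounting {

    // Counts the documents matching a query without materialising
    // ScoreDoc objects, so no large top-N buffer is needed.
    public static int countHits(IndexSearcher searcher, Query query) throws IOException {
        TotalHitCountCollector collector = new TotalHitCountCollector();
        searcher.search(query, collector);
        return collector.getTotalHits();
    }
}

Inside the loop above, ontologyTerm.frequency = HitCounting.countHits(isearcher, query); would then replace the ScoreDoc-based count.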
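
For completeness, a Play 1.x job like this one can be launched on demand with new ComputeNCITdistribution().now(), or scheduled declaratively. The sketch below, placed in the same jobs package, assumes Play's standard @On job annotation; the cron expression is purely illustrative.

import play.jobs.On;

// Illustrative schedule: run the distribution computation every day at 2 am.
@On("0 0 2 * * ?")
public class ScheduledNCITdistribution extends ComputeNCITdistribution {
}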