com.twentyn.patentScorer.ScoreMerger.java Source code

Java tutorial

Introduction

Here is the source code for com.twentyn.patentScorer.ScoreMerger.java

Source

/*************************************************************************
*                                                                        *
*  This file is part of the 20n/act project.                             *
*  20n/act enables DNA prediction for synthetic biology/bioengineering.  *
*  Copyright (C) 2017 20n Labs, Inc.                                     *
*                                                                        *
*  Please direct all queries to act@20n.com.                             *
*                                                                        *
*  This program is free software: you can redistribute it and/or modify  *
*  it under the terms of the GNU General Public License as published by  *
*  the Free Software Foundation, either version 3 of the License, or     *
*  (at your option) any later version.                                   *
*                                                                        *
*  This program is distributed in the hope that it will be useful,       *
*  but WITHOUT ANY WARRANTY; without even the implied warranty of        *
*  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
*  GNU General Public License for more details.                          *
*                                                                        *
*  You should have received a copy of the GNU General Public License     *
*  along with this program.  If not, see <http://www.gnu.org/licenses/>. *
*                                                                        *
*************************************************************************/

package com.twentyn.patentScorer;

import com.fasterxml.jackson.annotation.JsonAutoDetect;
import com.fasterxml.jackson.annotation.JsonProperty;
import com.fasterxml.jackson.annotation.PropertyAccessor;
import com.fasterxml.jackson.databind.JavaType;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.fasterxml.jackson.databind.SerializationConfig;
import com.fasterxml.jackson.databind.SerializationFeature;
import com.twentyn.patentSearch.DocumentSearch;
import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.CommandLineParser;
import org.apache.commons.cli.DefaultParser;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.Option;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.ParseException;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;

import java.io.BufferedReader;
import java.io.CharArrayReader;
import java.io.File;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.FilenameFilter;
import java.nio.CharBuffer;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.regex.Pattern;

public class ScoreMerger {
    public static final Logger LOGGER = LogManager.getLogger(ScoreMerger.class);

    public static void main(String[] args) throws Exception {
        System.out.println("Starting up...");
        System.out.flush();
        Options opts = new Options();
        opts.addOption(Option.builder("h").longOpt("help").desc("Print this help message and exit").build());

        opts.addOption(Option.builder("r").longOpt("results").required().hasArg()
                .desc("A directory of search results to read").build());
        opts.addOption(Option.builder("s").longOpt("scores").required().hasArg()
                .desc("A directory of patent classification scores to read").build());
        opts.addOption(Option.builder("o").longOpt("output").required().hasArg()
                .desc("The output file where results will be written.").build());

        HelpFormatter helpFormatter = new HelpFormatter();
        CommandLineParser cmdLineParser = new DefaultParser();
        CommandLine cmdLine = null;
        try {
            cmdLine = cmdLineParser.parse(opts, args);
        } catch (ParseException e) {
            System.out.println("Caught exception when parsing command line: " + e.getMessage());
            helpFormatter.printHelp("DocumentIndexer", opts);
            System.exit(1);
        }

        if (cmdLine.hasOption("help")) {
            helpFormatter.printHelp("DocumentIndexer", opts);
            System.exit(0);
        }
        File scoresDirectory = new File(cmdLine.getOptionValue("scores"));
        if (cmdLine.getOptionValue("scores") == null || !scoresDirectory.isDirectory()) {
            LOGGER.error("Not a directory of score files: " + cmdLine.getOptionValue("scores"));
        }

        File resultsDirectory = new File(cmdLine.getOptionValue("results"));
        if (cmdLine.getOptionValue("results") == null || !resultsDirectory.isDirectory()) {
            LOGGER.error("Not a directory of results files: " + cmdLine.getOptionValue("results"));
        }

        FileWriter outputWriter = new FileWriter(cmdLine.getOptionValue("output"));

        ObjectMapper objectMapper = new ObjectMapper();
        objectMapper.enable(SerializationFeature.INDENT_OUTPUT);
        objectMapper.setVisibility(PropertyAccessor.ALL, JsonAutoDetect.Visibility.ANY);

        FilenameFilter jsonFilter = new FilenameFilter() {
            public final Pattern JSON_PATTERN = Pattern.compile("\\.json$");

            public boolean accept(File dir, String name) {
                return JSON_PATTERN.matcher(name).find();
            }
        };

        Map<String, PatentScorer.ClassificationResult> scores = new HashMap<>();
        LOGGER.info("Reading scores from directory at " + scoresDirectory.getAbsolutePath());
        for (File scoreFile : scoresDirectory.listFiles(jsonFilter)) {
            BufferedReader reader = new BufferedReader(new FileReader(scoreFile));
            int count = 0;
            String line;
            while ((line = reader.readLine()) != null) {
                PatentScorer.ClassificationResult res = objectMapper.readValue(line,
                        PatentScorer.ClassificationResult.class);
                scores.put(res.docId, res);
                count++;
            }
            LOGGER.info("Read " + count + " scores from " + scoreFile.getAbsolutePath());
        }

        Map<String, List<DocumentSearch.SearchResult>> synonymsToResults = new HashMap<>();
        Map<String, List<DocumentSearch.SearchResult>> inchisToResults = new HashMap<>();
        LOGGER.info("Reading results from directory at " + resultsDirectory);
        // With help from http://stackoverflow.com/questions/6846244/jackson-and-generic-type-reference.
        JavaType resultsType = objectMapper.getTypeFactory().constructCollectionType(List.class,
                DocumentSearch.SearchResult.class);

        List<File> resultsFiles = Arrays.asList(resultsDirectory.listFiles(jsonFilter));
        Collections.sort(resultsFiles, new Comparator<File>() {
            @Override
            public int compare(File o1, File o2) {
                return o1.getName().compareTo(o2.getName());
            }
        });
        for (File resultsFile : resultsFiles) {
            BufferedReader reader = new BufferedReader(new FileReader(resultsFile));
            CharBuffer buffer = CharBuffer.allocate(Long.valueOf(resultsFile.length()).intValue());
            int bytesRead = reader.read(buffer);
            LOGGER.info("Read " + bytesRead + " bytes from " + resultsFile.getName() + " (length is "
                    + resultsFile.length() + ")");
            List<DocumentSearch.SearchResult> results = objectMapper.readValue(new CharArrayReader(buffer.array()),
                    resultsType);

            LOGGER.info("Read " + results.size() + " results from " + resultsFile.getAbsolutePath());

            int count = 0;
            for (DocumentSearch.SearchResult sres : results) {
                for (DocumentSearch.ResultDocument resDoc : sres.getResults()) {
                    String docId = resDoc.getDocId();
                    PatentScorer.ClassificationResult classificationResult = scores.get(docId);
                    if (classificationResult == null) {
                        LOGGER.warn("No classification result found for " + docId);
                    } else {
                        resDoc.setClassifierScore(classificationResult.getScore());
                    }
                }
                if (!synonymsToResults.containsKey(sres.getSynonym())) {
                    synonymsToResults.put(sres.getSynonym(), new ArrayList<DocumentSearch.SearchResult>());
                }
                synonymsToResults.get(sres.getSynonym()).add(sres);
                count++;
                if (count % 1000 == 0) {
                    LOGGER.info("Processed " + count + " search result documents");
                }
            }
        }

        Comparator<DocumentSearch.ResultDocument> resultDocumentComparator = new Comparator<DocumentSearch.ResultDocument>() {
            @Override
            public int compare(DocumentSearch.ResultDocument o1, DocumentSearch.ResultDocument o2) {
                int cmp = o2.getClassifierScore().compareTo(o1.getClassifierScore());
                if (cmp != 0) {
                    return cmp;
                }
                cmp = o2.getScore().compareTo(o1.getScore());
                return cmp;
            }
        };

        for (Map.Entry<String, List<DocumentSearch.SearchResult>> entry : synonymsToResults.entrySet()) {
            DocumentSearch.SearchResult newSearchRes = null;
            // Merge all result documents into a single search result.
            for (DocumentSearch.SearchResult sr : entry.getValue()) {
                if (newSearchRes == null) {
                    newSearchRes = sr;
                } else {
                    newSearchRes.getResults().addAll(sr.getResults());
                }
            }
            if (newSearchRes == null || newSearchRes.getResults() == null) {
                LOGGER.error("Search results for " + entry.getKey() + " are null.");
                continue;
            }
            Collections.sort(newSearchRes.getResults(), resultDocumentComparator);
            if (!inchisToResults.containsKey(newSearchRes.getInchi())) {
                inchisToResults.put(newSearchRes.getInchi(), new ArrayList<DocumentSearch.SearchResult>());
            }
            inchisToResults.get(newSearchRes.getInchi()).add(newSearchRes);
        }

        List<String> sortedKeys = new ArrayList<String>(inchisToResults.keySet());
        Collections.sort(sortedKeys);
        List<GroupedInchiResults> orderedResults = new ArrayList<>(sortedKeys.size());
        Comparator<DocumentSearch.SearchResult> synonymSorter = new Comparator<DocumentSearch.SearchResult>() {
            @Override
            public int compare(DocumentSearch.SearchResult o1, DocumentSearch.SearchResult o2) {
                return o1.getSynonym().compareTo(o2.getSynonym());
            }
        };
        for (String inchi : sortedKeys) {
            List<DocumentSearch.SearchResult> res = inchisToResults.get(inchi);
            Collections.sort(res, synonymSorter);
            orderedResults.add(new GroupedInchiResults(inchi, res));
        }

        objectMapper.writerWithView(Object.class).writeValue(outputWriter, orderedResults);
        outputWriter.close();
    }

    public static class GroupedInchiResults {
        @JsonProperty("inchi")
        protected String inchi;
        @JsonProperty("search_results")
        protected List<DocumentSearch.SearchResult> results;

        protected GroupedInchiResults() {
        }

        public GroupedInchiResults(String inchi, List<DocumentSearch.SearchResult> results) {
            this.inchi = inchi;
            this.results = results;
        }

        public String getInchi() {
            return inchi;
        }

        public List<DocumentSearch.SearchResult> getResults() {
            return results;
        }
    }
}