edu.gslis.ts.RunQuery.java Source code

Introduction

Here is the source code for edu.gslis.ts.RunQuery.java
Source

package edu.gslis.ts;
/*******************************************************************************
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 * 
 *   http://www.apache.org/licenses/LICENSE-2.0
 * 
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 ******************************************************************************/

import java.io.BufferedInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.InputStream;
import java.util.List;
import java.util.Map;
import java.util.StringTokenizer;
import java.util.TreeMap;

import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;

import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.CommandLineParser;
import org.apache.commons.cli.GnuParser;
import org.apache.commons.cli.Options;
import org.apache.commons.collections4.Bag;
import org.apache.commons.collections4.bag.HashBag;
import org.apache.commons.io.FileUtils;
import org.apache.thrift.protocol.TBinaryProtocol;
import org.apache.thrift.transport.TIOStreamTransport;
import org.apache.thrift.transport.TTransport;
import org.apache.thrift.transport.TTransportException;
import org.tukaani.xz.XZInputStream;
import org.w3c.dom.Document;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;

import streamcorpus_v3.StreamItem;
import edu.gslis.textrepresentation.FeatureVector;
import edu.gslis.utils.Stopper;
import gov.loc.repository.pairtree.Pairtree;

/**
 * Simple 
 */
public class RunQuery {
    final static double MU = 2500;

    public static void main(String[] args) {
        try {
            // Get the commandline options
            Options options = createOptions();
            CommandLineParser parser = new GnuParser();
            CommandLine cmd = parser.parse(options, args);

            String inputPath = cmd.getOptionValue("input");
            String eventsPath = cmd.getOptionValue("events");
            String stopPath = cmd.getOptionValue("stop");
            int queryId = Integer.valueOf(cmd.getOptionValue("query"));

            List<String> ids = FileUtils.readLines(new File(inputPath + File.separator + "ids.txt"));

            Stopper stopper = new Stopper(stopPath);
            Map<Integer, FeatureVector> queries = readEvents(eventsPath, stopper);

            FeatureVector query = queries.get(queryId);

            Pairtree ptree = new Pairtree();
            Bag<String> words = new HashBag<String>();

            for (String streamId : ids) {

                String ppath = ptree.mapToPPath(streamId.replace("-", ""));

                String inpath = inputPath + File.separator + ppath + File.separator + streamId + ".xz";
                //                System.out.println(inpath);
                File infile = new File(inpath);
                InputStream in = new XZInputStream(new FileInputStream(infile));

                TTransport inTransport = new TIOStreamTransport(new BufferedInputStream(in));
                TBinaryProtocol inProtocol = new TBinaryProtocol(inTransport);
                inTransport.open();
                final StreamItem item = new StreamItem();

                while (true) {
                    try {
                        item.read(inProtocol);
                        //                        System.out.println("Read " + item.stream_id);

                    } catch (TTransportException tte) {
                        // END_OF_FILE is used to indicate EOF and is not an exception.
                        if (tte.getType() != TTransportException.END_OF_FILE)
                            tte.printStackTrace();
                        break;
                    }
                }

                // Do something with this document...
                String docText = item.getBody().getClean_visible();

                StringTokenizer itr = new StringTokenizer(docText);
                while (itr.hasMoreTokens()) {
                    words.add(itr.nextToken());
                }

                inTransport.close();

            }

            for (String term : words.uniqueSet()) {
                System.out.println(term + ":" + words.getCount(term));
            }

        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    public static Options createOptions() {
        Options options = new Options();
        options.addOption("input", true, "Input chunk file");
        options.addOption("events", true, "Events file");
        options.addOption("stop", true, "Stoplist");
        options.addOption("query", true, "Query");
        return options;
    }

    public static Map<Integer, FeatureVector> readEvents(String path, Stopper stopper) {
        Map<Integer, FeatureVector> queries = new TreeMap<Integer, FeatureVector>();

        try {
            DocumentBuilderFactory dbf = DocumentBuilderFactory.newInstance();
            DocumentBuilder db = dbf.newDocumentBuilder();

            Document doc = db.parse(new File(path));

            NodeList events = doc.getDocumentElement().getElementsByTagName("event");
            for (int i = 0; i < events.getLength(); i++) {
                Node event = events.item(i);
                NodeList elements = event.getChildNodes();
                int id = -1;
                String query = "";

                for (int j = 0; j < elements.getLength(); j++) {
                    Node element = elements.item(j);
                    if (element == null)
                        continue;

                    if (element.getNodeName().equals("id"))
                        id = Integer.parseInt(element.getTextContent());
                    else if (element.getNodeName().equals("query"))
                        query = element.getTextContent();
                }
                queries.put(id, new FeatureVector(query, stopper));
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
        return queries;
    }
}