io.anserini.index.UserPostFrequencyDistribution.java Source code

Java tutorial

Introduction

Here is the source code for io.anserini.index.UserPostFrequencyDistribution.java

Source

/**
 * Twitter Tools
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package io.anserini.index;

import io.anserini.document.twitter.JsonStatusCorpusReader;
import io.anserini.document.twitter.Status;
import io.anserini.document.twitter.StatusStream;
import io.anserini.index.twitter.TweetAnalyzer;
import it.unimi.dsi.fastutil.longs.LongOpenHashSet;
//import twitter4j.Status;
import twitter4j.json.DataObjectFactory;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStream;
import java.net.URI;
import java.net.URISyntaxException;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.List;
import java.util.Properties;

import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.CommandLineParser;
import org.apache.commons.cli.GnuParser;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.Option;
import org.apache.commons.cli.OptionBuilder;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.ParseException;
import org.apache.commons.lang.StringEscapeUtils;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.core.WhitespaceAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.DoubleField;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.document.IntField;
import org.apache.lucene.document.LongField;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexOptions;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopScoreDocCollector;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.store.SimpleFSDirectory;
import org.apache.tools.bzip2.CBZip2InputStream;

/**
 * Reference implementation for indexing statuses.
 */
public class UserPostFrequencyDistribution {
    private static final Logger LOG = LogManager.getLogger(UserPostFrequencyDistribution.class);

    public static final Analyzer ANALYZER = new TweetAnalyzer();
    public static String corpusFormat = null;
    static int userIndexedCount = 0;
    static DirectoryReader reader;
    static IndexSearcher searcher;

    private UserPostFrequencyDistribution() {
    }

    public static enum StatusField {
        ID("id"), USER_ID_STRING("user_id_string"), SCREEN_NAME("screen_name"), EPOCH("epoch"), TEXT("text"), LANG(
                "lang"), IN_REPLY_TO_STATUS_ID("in_reply_to_status_id"), IN_REPLY_TO_USER_ID(
                        "in_reply_to_user_id"), FOLLOWERS_COUNT("followers_count"), FRIENDS_COUNT(
                                "friends_count"), STATUSES_COUNT("statuses_count"), RETWEETED_STATUS_ID(
                                        "retweeted_status_id"), RETWEETED_USER_ID(
                                                "retweeted_user_id"), RETWEET_COUNT("retweet_count"), LATITUDE(
                                                        "latitude"), LONGITUDE("longitude"), PLACE(
                                                                "place"), PREVIOUS_PITTSBURGH_POST(
                                                                        "previous_pittsburgh_post"), POST_COUNT_TOTAL(
                                                                                "post_count_total");

        public final String name;

        StatusField(String s) {
            name = s;
        }
    };

    static double pittsburghLongitude = -79.976389d;
    static double pittsburghLatitude = 40.439722d;

    private static final String HELP_OPTION = "h";
    private static final String COLLECTION_OPTION = "collection";

    private static final String STORE_TERM_VECTORS_OPTION = "store";

    @SuppressWarnings("static-access")
    public static void main(String[] args) throws Exception {
        Options options = new Options();

        options.addOption(new Option(HELP_OPTION, "show help"));

        options.addOption(new Option(STORE_TERM_VECTORS_OPTION, "store term vectors"));

        options.addOption(OptionBuilder.withArgName("collection").hasArg()
                .withDescription("source collection directory").create(COLLECTION_OPTION));
        options.addOption(OptionBuilder.withArgName("property").hasArg()
                .withDescription("source collection directory").create("property"));
        options.addOption(OptionBuilder.withArgName("collection_pattern").hasArg()
                .withDescription("source collection directory").create("collection_pattern"));

        CommandLine cmdline = null;
        CommandLineParser parser = new GnuParser();
        try {
            cmdline = parser.parse(options, args);
        } catch (ParseException exp) {
            System.err.println("Error parsing command line: " + exp.getMessage());
            System.exit(-1);
        }

        if (cmdline.hasOption(HELP_OPTION) || !cmdline.hasOption(COLLECTION_OPTION)) {
            HelpFormatter formatter = new HelpFormatter();
            formatter.printHelp(UserPostFrequencyDistribution.class.getName(), options);
            System.exit(-1);
        }

        String collectionPath = cmdline.getOptionValue(COLLECTION_OPTION);

        final FieldType textOptions = new FieldType();
        textOptions.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS);
        textOptions.setStored(true);
        textOptions.setTokenized(true);
        textOptions.setStoreTermVectors(true);

        LOG.info("collection: " + collectionPath);
        LOG.info("collection_pattern " + cmdline.getOptionValue("collection_pattern"));
        LOG.info("property " + cmdline.getOptionValue("property"));
        LongOpenHashSet deletes = null;

        long startTime = System.currentTimeMillis();
        File file = new File(collectionPath);
        if (!file.exists()) {
            System.err.println("Error: " + file + " does not exist!");
            System.exit(-1);
        }

        final JsonStatusCorpusReader stream = new JsonStatusCorpusReader(file,
                cmdline.getOptionValue("collection_pattern"));

        Runtime.getRuntime().addShutdownHook(new Thread() {
            public void run() {

                try {

                    stream.close();
                } catch (IOException e1) {
                    // TODO Auto-generated catch block
                    e1.printStackTrace();
                }
                ;

                System.out.println("# of users indexed this round: " + userIndexedCount);

                System.out.println("Shutting down");

            }
        });
        Status status;
        boolean readerNotInitialized = true;

        try {
            Properties prop = new Properties();
            while ((status = stream.next()) != null) {

                // try{
                // status = DataObjectFactory.createStatus(s);
                // if (status==null||status.getText() == null) {
                // continue;
                // }}catch(Exception e){
                //
                // }
                //

                boolean pittsburghRelated = false;
                try {

                    if (Math.abs(status.getLongitude() - pittsburghLongitude) < 0.05d
                            && Math.abs(status.getlatitude() - pittsburghLatitude) < 0.05d)
                        pittsburghRelated = true;
                } catch (Exception e) {

                }
                try {
                    if (status.getPlace().contains("Pittsburgh, PA"))
                        pittsburghRelated = true;
                } catch (Exception e) {

                }
                try {
                    if (status.getUserLocation().contains("Pittsburgh, PA"))
                        pittsburghRelated = true;
                } catch (Exception e) {

                }

                try {
                    if (status.getText().contains("Pittsburgh"))
                        pittsburghRelated = true;
                } catch (Exception e) {

                }

                if (pittsburghRelated) {

                    int previousPostCount = 0;

                    if (prop.containsKey(String.valueOf(status.getUserid()))) {
                        previousPostCount = Integer
                                .valueOf(prop.getProperty(String.valueOf(status.getUserid())).split(" ")[1]);
                    }

                    prop.setProperty(String.valueOf(status.getUserid()),
                            String.valueOf(status.getStatusesCount()) + " " + (1 + previousPostCount));
                    if (prop.size() > 0 && prop.size() % 1000 == 0) {
                        Runtime runtime = Runtime.getRuntime();
                        runtime.gc();
                        System.out.println("Property size " + prop.size() + "Memory used:  "
                                + ((runtime.totalMemory() - runtime.freeMemory()) / (1024L * 1024L)) + " MB\n");
                    }
                    OutputStream output = new FileOutputStream(cmdline.getOptionValue("property"), false);
                    prop.store(output, null);
                    output.close();

                }
            }
            //         prop.store(output, null);
            LOG.info(String.format("Total of %s statuses added", userIndexedCount));
            LOG.info("Total elapsed time: " + (System.currentTimeMillis() - startTime) + "ms");
        } catch (Exception e) {
            e.printStackTrace();
        } finally {

            stream.close();
        }
    }
}