edu.byu.nlp.data.app.DataExporter.java Source code


Introduction

Here is the source code for edu.byu.nlp.data.app.DataExporter.java.
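The DataExporter class reads a document collection (by default the 20_newsgroups corpus), tokenizes it, keeps the top-N features per document, and writes one line per instance in an SVMLight-style format: the instance's raw source, its observed label, and then an index:value pair for each non-zero feature. As a purely illustrative, hypothetical example, an exported line might look like:

    alt.atheism/49960 0 3:2.0 17:1.0 204:5.0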

Source

/**
 * Copyright 2012 Brigham Young University
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package edu.byu.nlp.data.app;

import java.io.BufferedOutputStream;
import java.io.IOException;
import java.io.PrintWriter;

import org.apache.commons.math3.random.MersenneTwister;
import org.apache.commons.math3.random.RandomGenerator;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.google.common.base.Function;
import com.google.common.collect.Iterables;

import edu.byu.nlp.data.docs.DocPipes;
import edu.byu.nlp.data.docs.DocumentDatasetBuilder;
import edu.byu.nlp.data.docs.TopNPerDocumentFeatureSelectorFactory;
import edu.byu.nlp.data.streams.EmailHeaderStripper;
import edu.byu.nlp.data.types.Dataset;
import edu.byu.nlp.data.types.DatasetInstance;
import edu.byu.nlp.data.types.SparseFeatureVector.Entry;
import edu.byu.nlp.io.Files2;
import edu.byu.nlp.io.Writers;
import edu.byu.nlp.util.jargparser.ArgumentParser;
import edu.byu.nlp.util.jargparser.annotations.Option;

/**
 * @author rah67
 *
 */
public class DataExporter {

    private static final Logger logger = LoggerFactory.getLogger(DataExporter.class);

    // TODO : share options with ClustererEvaluator
    @Option(help = "base directory of the documents")
    private static String basedir = "20_newsgroups";

    @Option
    private static String dataset = "reduced_set";

    @Option
    private static String split = "all";

    @Option
    private static int minFeaturesToKeepPerDocument = 10;

    // TODO : share code with ClustererEvaluator
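    // Builds the document dataset with DocumentDatasetBuilder (stripping email headers,
    // splitting sentences, tokenizing, and keeping the top-N features per document),
    // logs summary statistics, and returns the shuffled Dataset.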
    private static Dataset readData(RandomGenerator rnd) throws IOException {
        Function<String, String> tokenTransform = null; // TODO
        Integer featureNormalizationConstant = null;
        Dataset data = new DocumentDatasetBuilder(basedir, dataset, split, new EmailHeaderStripper(),
                DocPipes.opennlpSentenceSplitter(), DocPipes.McCallumAndNigamTokenizer(), tokenTransform,
                new TopNPerDocumentFeatureSelectorFactory(minFeaturesToKeepPerDocument),
                featureNormalizationConstant).dataset();

        // Print for verification
        // new StandardOutSink<Integer, SparseFeatureVector>().process(pipeAndData.getOutput());
        logger.info("Number of instances = " + data.getInfo().getNumDocuments());
        logger.info("Number of tokens = " + data.getInfo().getNumTokens());
        logger.info("Number of features = " + data.getInfo().getNumFeatures());
        logger.info("Number of classes = " + data.getInfo().getNumClasses());

        data.shuffle(rnd);
        return data;
    }

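    /**
     * Formats a {@link DatasetInstance} as an SVMLight-style line: the raw
     * source identifier, the observed label, and then an index:value pair
     * for each non-zero feature in the instance's sparse feature vector.
     */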
    public static class Instance2SVMLitePlus implements Function<DatasetInstance, String> {

        @Override
        public String apply(DatasetInstance instance) {
            StringBuilder sb = new StringBuilder();
            sb.append(instance.getInfo().getRawSource());
            sb.append(' ');
            sb.append(instance.getObservedLabel());
            for (Entry entry : instance.asFeatureVector().sparseEntries()) {
                sb.append(' ');
                sb.append(entry.getIndex());
                sb.append(':');
                sb.append(entry.getValue());
            }
            return sb.toString();
        }

    }

    public static void main(String[] args) throws IOException {
        args = new ArgumentParser(DataExporter.class).parseArgs(args).getPositionalArgs();

        RandomGenerator rnd = new MersenneTwister();
        Dataset dataset = readData(rnd);

        Iterable<String> it = Iterables.transform(dataset, new Instance2SVMLitePlus());
        if (args.length < 1) {
            // No output file given; write to standard output and flush the
            // buffered stream so no lines are lost when the JVM exits.
            PrintWriter out = new PrintWriter(new BufferedOutputStream(System.out));
            Writers.writeLines(out, it);
            out.flush();
        } else {
            Files2.writeLines(it, args[0]);
        }
    }

}
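
Usage

The exporter is driven from the command line; the @Option fields above supply the corpus location and feature-selection settings, and the single positional argument (if present) names the output file, otherwise the lines go to standard output. The snippet below is a minimal, hypothetical driver, assuming the class and its dependencies are on the classpath and that a 20_newsgroups directory with a reduced_set subset exists in the working directory; it is a sketch, not part of the original source.

public class DataExporterDemo {
    // Runs the exporter with its built-in option defaults (basedir "20_newsgroups",
    // dataset "reduced_set", split "all") and writes the SVMLight-style lines to
    // export.svmlight, passed here as the single positional argument.
    public static void main(String[] args) throws java.io.IOException {
        edu.byu.nlp.data.app.DataExporter.main(new String[] { "export.svmlight" });
    }
}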