Example usage for org.apache.hadoop.io Stringifier toString

Introduction

On this page you can find example usage for org.apache.hadoop.io Stringifier toString.

Prototype

public String toString(T obj) throws IOException;

Document

Converts the object to a string representation.
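
Before the larger examples below, here is a minimal, self-contained roundtrip using DefaultStringifier (the standard implementation of Stringifier); the class name StringifierRoundTrip and the sample value are illustrative only:

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.DefaultStringifier;
import org.apache.hadoop.io.Stringifier;
import org.apache.hadoop.io.Text;

public class StringifierRoundTrip {
    public static void main(String[] args) throws IOException {
        Configuration conf = new Configuration();
        // DefaultStringifier picks a serializer from io.serializations;
        // WritableSerialization is registered by default, so Text works as-is.
        Stringifier<Text> stringifier = new DefaultStringifier<Text>(conf, Text.class);

        // toString(T obj): serialize the object and encode the bytes as a String
        String encoded = stringifier.toString(new Text("hello"));

        // fromString(String str): decode and deserialize back into an object
        Text restored = stringifier.fromString(encoded);
        System.out.println(restored); // hello

        stringifier.close();
    }
}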

Usage

From source file: org.apache.mahout.text.wikipedia.WikipediaDatasetCreatorDriver.java

License: Apache License

/**
 * Run the job.
 * 
 * @param input
 *          the input pathname String
 * @param output
 *          the output pathname String
 * @param catFile
 *          the file containing the Wikipedia categories
 * @param exactMatchOnly
 *          if true, then the Wikipedia category must match exactly instead of simply containing the
 *          category string
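 * @param analyzerClass
 *          the Lucene Analyzer implementation used to tokenize the document text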
 */
public static void runJob(String input, String output, String catFile, boolean exactMatchOnly,
        Class<? extends Analyzer> analyzerClass)
        throws IOException, InterruptedException, ClassNotFoundException {
    Configuration conf = new Configuration();
    conf.set("key.value.separator.in.input.line", " ");
    conf.set("xmlinput.start", "<page>");
    conf.set("xmlinput.end", "</page>");
    conf.setBoolean("exact.match.only", exactMatchOnly);
    conf.set("analyzer.class", analyzerClass.getName());
    conf.set("io.serializations", "org.apache.hadoop.io.serializer.JavaSerialization,"
            + "org.apache.hadoop.io.serializer.WritableSerialization");
    // Don't ever forget this: Hadoop conf parameters such as io.serializations
    // can make or break a piece of code

    Set<String> categories = Sets.newHashSet();
    for (String line : new FileLineIterable(new File(catFile))) {
        categories.add(line.trim().toLowerCase(Locale.ENGLISH));
    }

    Stringifier<Set<String>> setStringifier = new DefaultStringifier<Set<String>>(conf,
            GenericsUtil.getClass(categories));

    String categoriesStr = setStringifier.toString(categories);

    conf.set("wikipedia.categories", categoriesStr);

    Job job = new Job(conf);
    log.info("Input: {} Out: {} Categories: {}", input, output, catFile);
    job.setJarByClass(WikipediaDatasetCreatorDriver.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);
    job.setMapperClass(WikipediaDatasetCreatorMapper.class);
    //TODO: job.setNumMapTasks(100);
    job.setInputFormatClass(XmlInputFormat.class);
    job.setReducerClass(WikipediaDatasetCreatorReducer.class);
    job.setOutputFormatClass(TextOutputFormat.class);

    FileInputFormat.setInputPaths(job, new Path(input));
    Path outPath = new Path(output);
    FileOutputFormat.setOutputPath(job, outPath);
    HadoopUtil.delete(conf, outPath);

    boolean succeeded = job.waitForCompletion(true);
    if (!succeeded) {
        throw new IllegalStateException("Job failed!");
    }
}
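
The configuration value written above is only half of the pattern: the mapper recreates an equivalent stringifier and calls fromString to get the category set back. The following is a simplified sketch of that read side (not the verbatim WikipediaDatasetCreatorMapper code), assuming the same "wikipedia.categories" key and io.serializations setting:

@Override
protected void setup(Context context) throws IOException, InterruptedException {
    super.setup(context);
    Configuration conf = context.getConfiguration();

    // Recreate a stringifier of the same type as the one used in runJob(),
    // then decode the category set stored under "wikipedia.categories".
    Set<String> categories = Sets.newHashSet();
    Stringifier<Set<String>> setStringifier =
            new DefaultStringifier<Set<String>>(conf, GenericsUtil.getClass(categories));
    categories = setStringifier.fromString(conf.get("wikipedia.categories"));
}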

From source file: org.apache.mahout.text.WikipediaToSequenceFile.java

License: Apache License

/**
 * Run the job.
 * 
 * @param input
 *          the input pathname String
 * @param output
 *          the output pathname String
 * @param catFile
 *          the file containing the Wikipedia categories
 * @param exactMatchOnly
 *          if true, then the Wikipedia category must match exactly instead of simply containing the
 *          category string
 * @param all
 *          if true select all categories
 * @param removeLabels
 *          if true, remove Category labels from the document text after extraction.
 *
 */
public static void runJob(String input, String output, String catFile, boolean exactMatchOnly, boolean all,
        boolean removeLabels) throws IOException, InterruptedException, ClassNotFoundException {
    Configuration conf = new Configuration();
    conf.set("xmlinput.start", "<page>");
    conf.set("xmlinput.end", "</page>");
    conf.setBoolean("exact.match.only", exactMatchOnly);
    conf.setBoolean("all.files", all);
    conf.setBoolean("remove.labels", removeLabels);
    conf.set("io.serializations", "org.apache.hadoop.io.serializer.JavaSerialization,"
            + "org.apache.hadoop.io.serializer.WritableSerialization");

    Set<String> categories = Sets.newHashSet();
    if (!catFile.isEmpty()) {
        for (String line : new FileLineIterable(new File(catFile))) {
            categories.add(line.trim().toLowerCase(Locale.ENGLISH));
        }
    }

    Stringifier<Set<String>> setStringifier = new DefaultStringifier<Set<String>>(conf,
            GenericsUtil.getClass(categories));

    String categoriesStr = setStringifier.toString(categories);
    conf.set("wikipedia.categories", categoriesStr);

    Job job = new Job(conf);
    log.info("Input: {} Out: {} Categories: {} All Files: {}", input, output, catFile, all);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);
    FileInputFormat.setInputPaths(job, new Path(input));
    Path outPath = new Path(output);
    FileOutputFormat.setOutputPath(job, outPath);
    job.setMapperClass(WikipediaMapper.class);
    job.setInputFormatClass(XmlInputFormat.class);
    job.setReducerClass(Reducer.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    job.setJarByClass(WikipediaToSequenceFile.class);

    /*
     * conf.set("mapred.compress.map.output", "true"); conf.set("mapred.map.output.compression.type",
     * "BLOCK"); conf.set("mapred.output.compress", "true"); conf.set("mapred.output.compression.type",
     * "BLOCK"); conf.set("mapred.output.compression.codec", "org.apache.hadoop.io.compress.GzipCodec");
     */
    HadoopUtil.delete(conf, outPath);

    boolean succeeded = job.waitForCompletion(true);
    if (!succeeded) {
        throw new IllegalStateException("Job failed!");
    }

}
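
DefaultStringifier also offers static store and load helpers that wrap exactly this create-stringifier / toString / conf.set sequence. A brief sketch of the equivalent using those helpers (the property name is reused from the example; treat the snippet as illustrative):

// Writing side: serialize the set straight into the configuration
// (still requires JavaSerialization in io.serializations, as set above).
DefaultStringifier.store(conf, categories, "wikipedia.categories");

// Reading side: deserialize it again from the same property.
Set<String> empty = Sets.newHashSet();
Set<String> restored = DefaultStringifier.load(conf, "wikipedia.categories", GenericsUtil.getClass(empty));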

From source file: org.warcbase.mapreduce.lib.Chain.java

License: Apache License

protected static void setMapperConf(Configuration jobConf, Class<?> inputKeyClass, Class<?> inputValueClass,
        Class<?> outputKeyClass, Class<?> outputValueClass, Configuration mapperConf, int index,
        String prefix) {
    // if the Mapper does not have a configuration, create an empty one
    if (mapperConf == null) {
        // Use a Configuration without defaults to keep it lightweight; the
        // chain's conf may still have all the defaults, and this conf is
        // overlaid on the chain configuration.
        mapperConf = new Configuration(true);
    }

    // store the input/output classes of the mapper in the mapper conf
    mapperConf.setClass(MAPPER_INPUT_KEY_CLASS, inputKeyClass, Object.class);
    mapperConf.setClass(MAPPER_INPUT_VALUE_CLASS, inputValueClass, Object.class);
    mapperConf.setClass(MAPPER_OUTPUT_KEY_CLASS, outputKeyClass, Object.class);
    mapperConf.setClass(MAPPER_OUTPUT_VALUE_CLASS, outputValueClass, Object.class);
    // serialize the mapper configuration into the chain configuration.
    Stringifier<Configuration> stringifier = new DefaultStringifier<Configuration>(jobConf,
            Configuration.class);
    try {
        jobConf.set(prefix + CHAIN_MAPPER_CONFIG + index, stringifier.toString(new Configuration(mapperConf)));
    } catch (IOException ioEx) {
        throw new RuntimeException(ioEx);
    }

    // increment the chain counter
    jobConf.setInt(prefix + CHAIN_MAPPER_SIZE, index + 1);
}
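
The chain reverses this when it instantiates each mapper, reading the stored string back with fromString. A simplified sketch of that read side (modeled on Hadoop's Chain.getChainElementConf, shown here for illustration):

protected static Configuration getChainElementConf(Configuration jobConf, String confKey) {
    // Decode the per-mapper Configuration that setMapperConf() stored under
    // prefix + CHAIN_MAPPER_CONFIG + index (passed in here as confKey).
    Configuration conf = null;
    try {
        Stringifier<Configuration> stringifier =
                new DefaultStringifier<Configuration>(jobConf, Configuration.class);
        String confString = jobConf.get(confKey, null);
        if (confString != null) {
            conf = stringifier.fromString(confString);
        }
    } catch (IOException ioEx) {
        throw new RuntimeException(ioEx);
    }
    return conf;
}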