List of usage examples for org.apache.hadoop.io.Stringifier#toString
public String toString(T obj) throws IOException;
From source file: org.apache.mahout.text.wikipedia.WikipediaDatasetCreatorDriver.java
License: Apache License
/** * Run the job/*from w ww . j av a 2 s .c o m*/ * * @param input * the input pathname String * @param output * the output pathname String * @param catFile * the file containing the Wikipedia categories * @param exactMatchOnly * if true, then the Wikipedia category must match exactly instead of simply containing the * category string */ public static void runJob(String input, String output, String catFile, boolean exactMatchOnly, Class<? extends Analyzer> analyzerClass) throws IOException, InterruptedException, ClassNotFoundException { Configuration conf = new Configuration(); conf.set("key.value.separator.in.input.line", " "); conf.set("xmlinput.start", "<page>"); conf.set("xmlinput.end", "</page>"); conf.setBoolean("exact.match.only", exactMatchOnly); conf.set("analyzer.class", analyzerClass.getName()); conf.set("io.serializations", "org.apache.hadoop.io.serializer.JavaSerialization," + "org.apache.hadoop.io.serializer.WritableSerialization"); // Dont ever forget this. People should keep track of how hadoop conf // parameters can make or break a piece of code Set<String> categories = Sets.newHashSet(); for (String line : new FileLineIterable(new File(catFile))) { categories.add(line.trim().toLowerCase(Locale.ENGLISH)); } Stringifier<Set<String>> setStringifier = new DefaultStringifier<Set<String>>(conf, GenericsUtil.getClass(categories)); String categoriesStr = setStringifier.toString(categories); conf.set("wikipedia.categories", categoriesStr); Job job = new Job(conf); log.info("Input: {} Out: {} Categories: {}", input, output, catFile); job.setJarByClass(WikipediaDatasetCreatorDriver.class); job.setOutputKeyClass(Text.class); job.setOutputValueClass(Text.class); job.setMapperClass(WikipediaDatasetCreatorMapper.class); //TODO: job.setNumMapTasks(100); job.setInputFormatClass(XmlInputFormat.class); job.setReducerClass(WikipediaDatasetCreatorReducer.class); job.setOutputFormatClass(TextOutputFormat.class); FileInputFormat.setInputPaths(job, new Path(input)); 
Path outPath = new Path(output); FileOutputFormat.setOutputPath(job, outPath); HadoopUtil.delete(conf, outPath); boolean succeeded = job.waitForCompletion(true); if (!succeeded) { throw new IllegalStateException("Job failed!"); } }
From source file: org.apache.mahout.text.WikipediaToSequenceFile.java
License: Apache License
/** * Run the job/*from w ww . ja va 2s. c o m*/ * * @param input * the input pathname String * @param output * the output pathname String * @param catFile * the file containing the Wikipedia categories * @param exactMatchOnly * if true, then the Wikipedia category must match exactly instead of simply containing the * category string * @param all * if true select all categories * @param removeLabels * if true remove Category labels from document text after extracting. * */ public static void runJob(String input, String output, String catFile, boolean exactMatchOnly, boolean all, boolean removeLabels) throws IOException, InterruptedException, ClassNotFoundException { Configuration conf = new Configuration(); conf.set("xmlinput.start", "<page>"); conf.set("xmlinput.end", "</page>"); conf.setBoolean("exact.match.only", exactMatchOnly); conf.setBoolean("all.files", all); conf.setBoolean("remove.labels", removeLabels); conf.set("io.serializations", "org.apache.hadoop.io.serializer.JavaSerialization," + "org.apache.hadoop.io.serializer.WritableSerialization"); Set<String> categories = Sets.newHashSet(); if (!catFile.isEmpty()) { for (String line : new FileLineIterable(new File(catFile))) { categories.add(line.trim().toLowerCase(Locale.ENGLISH)); } } Stringifier<Set<String>> setStringifier = new DefaultStringifier<Set<String>>(conf, GenericsUtil.getClass(categories)); String categoriesStr = setStringifier.toString(categories); conf.set("wikipedia.categories", categoriesStr); Job job = new Job(conf); log.info("Input: {} Out: {} Categories: {} All Files: {}", input, output, catFile, all); job.setOutputKeyClass(Text.class); job.setOutputValueClass(Text.class); FileInputFormat.setInputPaths(job, new Path(input)); Path outPath = new Path(output); FileOutputFormat.setOutputPath(job, outPath); job.setMapperClass(WikipediaMapper.class); job.setInputFormatClass(XmlInputFormat.class); job.setReducerClass(Reducer.class); job.setOutputFormatClass(SequenceFileOutputFormat.class); 
job.setJarByClass(WikipediaToSequenceFile.class); /* * conf.set("mapred.compress.map.output", "true"); conf.set("mapred.map.output.compression.type", * "BLOCK"); conf.set("mapred.output.compress", "true"); conf.set("mapred.output.compression.type", * "BLOCK"); conf.set("mapred.output.compression.codec", "org.apache.hadoop.io.compress.GzipCodec"); */ HadoopUtil.delete(conf, outPath); boolean succeeded = job.waitForCompletion(true); if (!succeeded) { throw new IllegalStateException("Job failed!"); } }
From source file: org.warcbase.mapreduce.lib.Chain.java
License: Apache License
protected static void setMapperConf(Configuration jobConf, Class<?> inputKeyClass, Class<?> inputValueClass, Class<?> outputKeyClass, Class<?> outputValueClass, Configuration mapperConf, int index, String prefix) {/* w w w . j a va2 s .c om*/ // if the Mapper does not have a configuration, create an empty one if (mapperConf == null) { // using a Configuration without defaults to make it lightweight. // still the chain's conf may have all defaults and this conf is // overlapped to the chain configuration one. mapperConf = new Configuration(true); } // store the input/output classes of the mapper in the mapper conf mapperConf.setClass(MAPPER_INPUT_KEY_CLASS, inputKeyClass, Object.class); mapperConf.setClass(MAPPER_INPUT_VALUE_CLASS, inputValueClass, Object.class); mapperConf.setClass(MAPPER_OUTPUT_KEY_CLASS, outputKeyClass, Object.class); mapperConf.setClass(MAPPER_OUTPUT_VALUE_CLASS, outputValueClass, Object.class); // serialize the mapper configuration in the chain configuration. Stringifier<Configuration> stringifier = new DefaultStringifier<Configuration>(jobConf, Configuration.class); try { jobConf.set(prefix + CHAIN_MAPPER_CONFIG + index, stringifier.toString(new Configuration(mapperConf))); } catch (IOException ioEx) { throw new RuntimeException(ioEx); } // increment the chain counter jobConf.setInt(prefix + CHAIN_MAPPER_SIZE, index + 1); }