List of usage examples for org.apache.hadoop.io.Stringifier#toString
public String toString(T obj) throws IOException;
From source file: org.apache.mahout.text.wikipedia.WikipediaDatasetCreatorDriver.java
License: Apache License
/** * Run the job/*from w ww . j av a 2 s .c o m*/ * * @param input * the input pathname String * @param output * the output pathname String * @param catFile * the file containing the Wikipedia categories * @param exactMatchOnly * if true, then the Wikipedia category must match exactly instead of simply containing the * category string */ public static void runJob(String input, String output, String catFile, boolean exactMatchOnly, Class<? extends Analyzer> analyzerClass) throws IOException, InterruptedException, ClassNotFoundException { Configuration conf = new Configuration(); conf.set("key.value.separator.in.input.line", " "); conf.set("xmlinput.start", "<page>"); conf.set("xmlinput.end", "</page>"); conf.setBoolean("exact.match.only", exactMatchOnly); conf.set("analyzer.class", analyzerClass.getName()); conf.set("io.serializations", "org.apache.hadoop.io.serializer.JavaSerialization," + "org.apache.hadoop.io.serializer.WritableSerialization"); // Dont ever forget this. People should keep track of how hadoop conf // parameters can make or break a piece of code Set<String> categories = Sets.newHashSet(); for (String line : new FileLineIterable(new File(catFile))) { categories.add(line.trim().toLowerCase(Locale.ENGLISH)); } Stringifier<Set<String>> setStringifier = new DefaultStringifier<Set<String>>(conf, GenericsUtil.getClass(categories)); String categoriesStr = setStringifier.toString(categories); conf.set("wikipedia.categories", categoriesStr); Job job = new Job(conf); log.info("Input: {} Out: {} Categories: {}", input, output, catFile); job.setJarByClass(WikipediaDatasetCreatorDriver.class); job.setOutputKeyClass(Text.class); job.setOutputValueClass(Text.class); job.setMapperClass(WikipediaDatasetCreatorMapper.class); //TODO: job.setNumMapTasks(100); job.setInputFormatClass(XmlInputFormat.class); job.setReducerClass(WikipediaDatasetCreatorReducer.class); job.setOutputFormatClass(TextOutputFormat.class); FileInputFormat.setInputPaths(job, new Path(input)); 
Path outPath = new Path(output); FileOutputFormat.setOutputPath(job, outPath); HadoopUtil.delete(conf, outPath); boolean succeeded = job.waitForCompletion(true); if (!succeeded) { throw new IllegalStateException("Job failed!"); } }
From source file: org.apache.mahout.text.WikipediaToSequenceFile.java
License: Apache License
/** * Run the job/*from w ww . ja va 2s. c o m*/ * * @param input * the input pathname String * @param output * the output pathname String * @param catFile * the file containing the Wikipedia categories * @param exactMatchOnly * if true, then the Wikipedia category must match exactly instead of simply containing the * category string * @param all * if true select all categories * @param removeLabels * if true remove Category labels from document text after extracting. * */ public static void runJob(String input, String output, String catFile, boolean exactMatchOnly, boolean all, boolean removeLabels) throws IOException, InterruptedException, ClassNotFoundException { Configuration conf = new Configuration(); conf.set("xmlinput.start", "<page>"); conf.set("xmlinput.end", "</page>"); conf.setBoolean("exact.match.only", exactMatchOnly); conf.setBoolean("all.files", all); conf.setBoolean("remove.labels", removeLabels); conf.set("io.serializations", "org.apache.hadoop.io.serializer.JavaSerialization," + "org.apache.hadoop.io.serializer.WritableSerialization"); Set<String> categories = Sets.newHashSet(); if (!catFile.isEmpty()) { for (String line : new FileLineIterable(new File(catFile))) { categories.add(line.trim().toLowerCase(Locale.ENGLISH)); } } Stringifier<Set<String>> setStringifier = new DefaultStringifier<Set<String>>(conf, GenericsUtil.getClass(categories)); String categoriesStr = setStringifier.toString(categories); conf.set("wikipedia.categories", categoriesStr); Job job = new Job(conf); log.info("Input: {} Out: {} Categories: {} All Files: {}", input, output, catFile, all); job.setOutputKeyClass(Text.class); job.setOutputValueClass(Text.class); FileInputFormat.setInputPaths(job, new Path(input)); Path outPath = new Path(output); FileOutputFormat.setOutputPath(job, outPath); job.setMapperClass(WikipediaMapper.class); job.setInputFormatClass(XmlInputFormat.class); job.setReducerClass(Reducer.class); job.setOutputFormatClass(SequenceFileOutputFormat.class); 
job.setJarByClass(WikipediaToSequenceFile.class); /* * conf.set("mapred.compress.map.output", "true"); conf.set("mapred.map.output.compression.type", * "BLOCK"); conf.set("mapred.output.compress", "true"); conf.set("mapred.output.compression.type", * "BLOCK"); conf.set("mapred.output.compression.codec", "org.apache.hadoop.io.compress.GzipCodec"); */ HadoopUtil.delete(conf, outPath); boolean succeeded = job.waitForCompletion(true); if (!succeeded) { throw new IllegalStateException("Job failed!"); } }
From source file: org.warcbase.mapreduce.lib.Chain.java
License: Apache License
protected static void setMapperConf(Configuration jobConf, Class<?> inputKeyClass, Class<?> inputValueClass, Class<?> outputKeyClass, Class<?> outputValueClass, Configuration mapperConf, int index, String prefix) {/* w w w . j a va2 s .c om*/ // if the Mapper does not have a configuration, create an empty one if (mapperConf == null) { // using a Configuration without defaults to make it lightweight. // still the chain's conf may have all defaults and this conf is // overlapped to the chain configuration one. mapperConf = new Configuration(true); } // store the input/output classes of the mapper in the mapper conf mapperConf.setClass(MAPPER_INPUT_KEY_CLASS, inputKeyClass, Object.class); mapperConf.setClass(MAPPER_INPUT_VALUE_CLASS, inputValueClass, Object.class); mapperConf.setClass(MAPPER_OUTPUT_KEY_CLASS, outputKeyClass, Object.class); mapperConf.setClass(MAPPER_OUTPUT_VALUE_CLASS, outputValueClass, Object.class); // serialize the mapper configuration in the chain configuration. Stringifier<Configuration> stringifier = new DefaultStringifier<Configuration>(jobConf, Configuration.class); try { jobConf.set(prefix + CHAIN_MAPPER_CONFIG + index, stringifier.toString(new Configuration(mapperConf))); } catch (IOException ioEx) { throw new RuntimeException(ioEx); } // increment the chain counter jobConf.setInt(prefix + CHAIN_MAPPER_SIZE, index + 1); }