List of usage examples for org.apache.hadoop.io MapWritable keySet
@Override
public Set<Writable> keySet()
From source file:org.freeeed.mr.FreeEedReducer.java
License:Apache License
private DocumentMetadata getAllMetadata(MapWritable map) { DocumentMetadata metadata = new DocumentMetadata(); Set<Writable> set = map.keySet(); Iterator<Writable> iter = set.iterator(); while (iter.hasNext()) { String name = iter.next().toString(); if (!ParameterProcessing.NATIVE.equals(name) && !ParameterProcessing.NATIVE_AS_PDF.equals(name) && !name.startsWith(ParameterProcessing.NATIVE_AS_HTML)) { // all metadata but native - which is bytes! Text value = (Text) map.get(new Text(name)); metadata.set(name, value.toString()); }/*from www .j a va 2s. c o m*/ } return metadata; }
From source file:smile.wide.AttributeValueHistogram.java
License:Apache License
@Override public int run(String[] arg) throws Exception { if (arg.length < 2) { s_logger.fatal("Usage: AttributeValueHistogram <infile> <outfile>"); // TODO: return an error code? }//from w w w .j av a2 s . co m s_logger.debug("Got " + arg.length + " arguments"); inPath_ = arg[0]; s_logger.info("Input path is " + inPath_); // parse the key-value arguments passed - by now these are the arguments // specific to AttributeValueHistogram for (int i = 1; i < arg.length; ++i) { String[] tokens = arg[i].split("="); if (tokens.length != 2) { s_logger.fatal("Can't parse argument" + arg[i]); } if (tokens[0].equals("xdata.bayesnets.datasetreader.class")) { readerClass_ = tokens[1].trim(); s_logger.debug("Set reader class to " + readerClass_); } else if (tokens[0].equals("xdata.bayesnets.datasetreader.filter")) { readerFilter_ = tokens[1].trim(); s_logger.debug("Set reader filter to " + readerFilter_); } else if (tokens[0].equals("xdata.bayesnets.datasetreader.instid")) { readerInstID_ = tokens[1].trim(); s_logger.debug("Set reader's instance ID column to " + readerInstID_); } else if (tokens[0].equals("xdata.bayesnets.datasetreader.variablenames")) { variableNames_ = tokens[1].trim(); s_logger.debug("Set reader's variable names to " + variableNames_); } else { s_logger.warn("Unknown argument " + arg[i]); } } conf_ = getConf(); // pass the reader class to the mapper, in jobconf // TODO: use setClass here - fails early if wrong, not in the mapper conf_.set("xdata.bayesnets.datasetreader.class", readerClass_); conf_.set("xdata.bayesnets.datasetreader.filter", readerFilter_); // conf_.set("xdata.bayesnets.datasetreader.instid", readerInstID_); // not used conf_.set("xdata.bayesnets.datasetreader.variablenames", variableNames_); conf_.setBoolean("mapred.compress.map.output", true); // compress intermediate data conf_.set("mapred.output.compression.type", CompressionType.BLOCK.toString()); // by block, to keep splittable conf_.setClass("mapred.map.output.compression.codec", GzipCodec.class, CompressionCodec.class); // for debugging conf_.set("keep.failed.task.files", "true"); conf_.set("keep.failed.task.pattern", "*"); Job job = new Job(conf_); job.setJarByClass(AttributeValueHistogram.class); // use this jar job.setJobName("Collect value histograms by attribute"); FileInputFormat.addInputPath(job, new Path(inPath_)); int rnd = (new Random()).nextInt(); lastWorkingDir_ = job.getWorkingDirectory().toUri(); s_logger.info("Job working directory is " + lastWorkingDir_); String tempDirName = job.getWorkingDirectory() + "/tmp/attvalhist" + rnd + ".tmp"; s_logger.info("Temp files in directory " + tempDirName); FileOutputFormat.setOutputPath(job, new Path(tempDirName)); job.setMapperClass(AttributeValueHistogramMapper.class); job.setCombinerClass(AttributeValueHistogramReducer.class); job.setReducerClass(AttributeValueHistogramReducer.class); // set both the map and reduce in/out classes job.setOutputKeyClass(Text.class); // the name of the attribute job.setOutputValueClass(MapWritable.class); // Value -> count map job.setOutputFormatClass(SequenceFileOutputFormat.class); // run'em int result = job.waitForCompletion(true) ? 0 : 16; // retain the temp file, collect the output attributeValues_ = new TreeMap<String, Map<String, Integer>>(); FileSystem fs = FileSystem.get(conf_); SequenceFile.Reader reader = null; Path resPath = new Path(tempDirName); FileStatus[] stats = fs.listStatus(resPath); // read all output files for (FileStatus stat : stats) { if (stat.getPath().toUri().toString().contains("part-r-")) try { s_logger.info("Reading results from " + stat.getPath()); reader = new SequenceFile.Reader(fs, stat.getPath(), conf_); // Text key = (Text) ReflectionUtils.newInstance(reader.getKeyClass(), conf_); // MapWritable value = (MapWritable) ReflectionUtils.newInstance(reader.getValueClass(), conf_); Text key = new Text(); MapWritable value = new MapWritable(); while (reader.next(key, value)) { TreeMap<String, Integer> valueCounts = new TreeMap<String, Integer>(); for (Writable attValue : value.keySet()) { valueCounts.put(((Text) attValue).toString(), ((IntWritable) (value.get(attValue))).get()); } attributeValues_.put(key.toString(), valueCounts); } } finally { IOUtils.closeStream(reader); } } fs.deleteOnExit(resPath); return result; }
From source file:smile.wide.AttributeValueHistogramReducer.java
License:Apache License
@Override public void reduce(Text key, Iterable<MapWritable> values, Context context) throws IOException, InterruptedException { // Let's have a map and internally collect them int maps = 0; int vals = 0; HashMap<Text, Integer> myMap = new HashMap<Text, Integer>(); for (MapWritable m : values) { maps++;/* w w w .j a v a 2 s. c o m*/ for (Writable valName : m.keySet()) { Text val = (Text) valName; Integer count = ((IntWritable) (m.get(valName))).get(); if (myMap.containsKey(val)) { myMap.put(val, myMap.get(val) + count); } else { myMap.put(val, count); vals++; } } } s_logger.debug("Reducer/combiner got " + maps + " maps, with a total of " + vals + " distinct values for attribute `" + key + "`"); // now output // key is key // value is myMap as MapWritable<Text, IntWritable> MapWritable output = new MapWritable(); for (Text t : myMap.keySet()) { s_logger.debug("Outputting count " + myMap.get(t) + " for attribute " + t); output.put(t, new IntWritable(myMap.get(t))); } context.write(key, output); }