List of usage examples for the org.apache.hadoop.io.MapWritable constructor
public MapWritable()
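Before the examples taken from third-party projects, here is a minimal, self-contained sketch (not from any of the source files below) of what the no-argument constructor gives you: an empty MapWritable that can be populated with Writable keys and values and round-tripped through Hadoop serialization. The class name MapWritableRoundTrip is illustrative only; the Hadoop calls (put, get, write, readFields) are the standard API.

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.DataInputStream;
import java.io.DataOutputStream;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.MapWritable;
import org.apache.hadoop.io.Text;

public class MapWritableRoundTrip {
    public static void main(String[] args) throws Exception {
        MapWritable map = new MapWritable(); // the no-arg constructor: an empty map
        map.put(new Text("count"), new IntWritable(42));

        // serialize to a byte array
        ByteArrayOutputStream bytesOut = new ByteArrayOutputStream();
        map.write(new DataOutputStream(bytesOut));

        // deserialize into a fresh instance
        MapWritable copy = new MapWritable();
        copy.readFields(new DataInputStream(new ByteArrayInputStream(bytesOut.toByteArray())));

        System.out.println(copy.get(new Text("count"))); // prints 42
    }
}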
From source file: org.wonderbee.elasticsearch.hive.ElasticSearchSerDe.java
License: Apache License
@Override
public Writable serialize(Object obj, ObjectInspector objInspector) throws SerDeException {
    StructObjectInspector outputRowOI = (StructObjectInspector) objInspector;
    List<? extends StructField> outputFieldRefs = outputRowOI.getAllStructFieldRefs();
    MapWritable record = new MapWritable();

    String isJson = props.getProperty(ES_IS_JSON);
    if ("true".equalsIgnoreCase(isJson)) {
        throw new SerDeException("Json mode not yet supported");
    }

    // Handle delimited records (ie. isJson == false)
    for (int c = 0; c < numColumns; c++) {
        try {
            Object field = outputRowOI.getStructFieldData(obj, outputFieldRefs.get(c));
            ObjectInspector fieldOI = outputFieldRefs.get(c).getFieldObjectInspector();
            PrimitiveObjectInspector fieldStringOI = (PrimitiveObjectInspector) fieldOI;
            String columnName = columnNames.get(c);
            record.put(new Text(columnName), (Writable) fieldStringOI.getPrimitiveWritableObject(field));
        } catch (NullPointerException e) {
            //LOG.info("Increment null field counter.");
        }
    }
    return record;
}
From source file: org.wonderbee.elasticsearch.hive.ElasticSearchSerDe.java
License: Apache License
/**
 * Recursively converts an arbitrary object into the appropriate writable. Please enlighten me if there is an
 * existing method for doing this.
 */
private Writable toWritable(Object thing) {
    if (thing instanceof String) {
        return new Text((String) thing);
    } else if (thing instanceof Long) {
        return new LongWritable((Long) thing);
    } else if (thing instanceof Integer) {
        return new IntWritable((Integer) thing);
    } else if (thing instanceof Double) {
        return new DoubleWritable((Double) thing);
    } else if (thing instanceof Float) {
        return new FloatWritable((Float) thing);
    } else if (thing instanceof Boolean) {
        return new BooleanWritable((Boolean) thing);
    } else if (thing instanceof Map) {
        MapWritable result = new MapWritable();
        for (Map.Entry<String, Object> entry : ((Map<String, Object>) thing).entrySet()) {
            result.put(new Text(entry.getKey().toString()), toWritable(entry.getValue()));
        }
        return result;
    } else if (thing instanceof List) {
        if (((List) thing).size() > 0) {
            Object first = ((List) thing).get(0);
            Writable[] listOfThings = new Writable[((List) thing).size()];
            for (int i = 0; i < listOfThings.length; i++) {
                listOfThings[i] = toWritable(((List) thing).get(i));
            }
            return new ArrayWritable(toWritable(first).getClass(), listOfThings);
        }
    }
    return NullWritable.get();
}
From source file: smile.wide.AttributeValueHistogram.java
License: Apache License
@Override
public int run(String[] arg) throws Exception {
    if (arg.length < 2) {
        s_logger.fatal("Usage: AttributeValueHistogram <infile> <outfile>");
        // TODO: return an error code?
    }

    s_logger.debug("Got " + arg.length + " arguments");

    inPath_ = arg[0];
    s_logger.info("Input path is " + inPath_);

    // parse the key-value arguments passed - by now these are the arguments
    // specific to AttributeValueHistogram
    for (int i = 1; i < arg.length; ++i) {
        String[] tokens = arg[i].split("=");
        if (tokens.length != 2) {
            s_logger.fatal("Can't parse argument" + arg[i]);
        }
        if (tokens[0].equals("xdata.bayesnets.datasetreader.class")) {
            readerClass_ = tokens[1].trim();
            s_logger.debug("Set reader class to " + readerClass_);
        } else if (tokens[0].equals("xdata.bayesnets.datasetreader.filter")) {
            readerFilter_ = tokens[1].trim();
            s_logger.debug("Set reader filter to " + readerFilter_);
        } else if (tokens[0].equals("xdata.bayesnets.datasetreader.instid")) {
            readerInstID_ = tokens[1].trim();
            s_logger.debug("Set reader's instance ID column to " + readerInstID_);
        } else if (tokens[0].equals("xdata.bayesnets.datasetreader.variablenames")) {
            variableNames_ = tokens[1].trim();
            s_logger.debug("Set reader's variable names to " + variableNames_);
        } else {
            s_logger.warn("Unknown argument " + arg[i]);
        }
    }

    conf_ = getConf();

    // pass the reader class to the mapper, in jobconf
    // TODO: use setClass here - fails early if wrong, not in the mapper
    conf_.set("xdata.bayesnets.datasetreader.class", readerClass_);
    conf_.set("xdata.bayesnets.datasetreader.filter", readerFilter_);
    // conf_.set("xdata.bayesnets.datasetreader.instid", readerInstID_); // not used
    conf_.set("xdata.bayesnets.datasetreader.variablenames", variableNames_);

    conf_.setBoolean("mapred.compress.map.output", true); // compress intermediate data
    conf_.set("mapred.output.compression.type", CompressionType.BLOCK.toString()); // by block, to keep splittable
    conf_.setClass("mapred.map.output.compression.codec", GzipCodec.class, CompressionCodec.class);

    // for debugging
    conf_.set("keep.failed.task.files", "true");
    conf_.set("keep.failed.task.pattern", "*");

    Job job = new Job(conf_);
    job.setJarByClass(AttributeValueHistogram.class); // use this jar
    job.setJobName("Collect value histograms by attribute");

    FileInputFormat.addInputPath(job, new Path(inPath_));

    int rnd = (new Random()).nextInt();
    lastWorkingDir_ = job.getWorkingDirectory().toUri();
    s_logger.info("Job working directory is " + lastWorkingDir_);
    String tempDirName = job.getWorkingDirectory() + "/tmp/attvalhist" + rnd + ".tmp";
    s_logger.info("Temp files in directory " + tempDirName);
    FileOutputFormat.setOutputPath(job, new Path(tempDirName));

    job.setMapperClass(AttributeValueHistogramMapper.class);
    job.setCombinerClass(AttributeValueHistogramReducer.class);
    job.setReducerClass(AttributeValueHistogramReducer.class);

    // set both the map and reduce in/out classes
    job.setOutputKeyClass(Text.class); // the name of the attribute
    job.setOutputValueClass(MapWritable.class); // Value -> count map
    job.setOutputFormatClass(SequenceFileOutputFormat.class);

    // run'em
    int result = job.waitForCompletion(true) ? 0 : 16;

    // retain the temp file, collect the output
    attributeValues_ = new TreeMap<String, Map<String, Integer>>();

    FileSystem fs = FileSystem.get(conf_);
    SequenceFile.Reader reader = null;

    Path resPath = new Path(tempDirName);
    FileStatus[] stats = fs.listStatus(resPath);

    // read all output files
    for (FileStatus stat : stats) {
        if (stat.getPath().toUri().toString().contains("part-r-"))
            try {
                s_logger.info("Reading results from " + stat.getPath());
                reader = new SequenceFile.Reader(fs, stat.getPath(), conf_);
                // Text key = (Text) ReflectionUtils.newInstance(reader.getKeyClass(), conf_);
                // MapWritable value = (MapWritable) ReflectionUtils.newInstance(reader.getValueClass(), conf_);
                Text key = new Text();
                MapWritable value = new MapWritable();

                while (reader.next(key, value)) {
                    TreeMap<String, Integer> valueCounts = new TreeMap<String, Integer>();
                    for (Writable attValue : value.keySet()) {
                        valueCounts.put(((Text) attValue).toString(), ((IntWritable) (value.get(attValue))).get());
                    }
                    attributeValues_.put(key.toString(), valueCounts);
                }
            } finally {
                IOUtils.closeStream(reader);
            }
    }

    fs.deleteOnExit(resPath);

    return result;
}
From source file: smile.wide.AttributeValueHistogramMapper.java
License: Apache License
@SuppressWarnings("unchecked") @Override//from w w w. j a v a2 s . co m public void map(LongWritable offsetkey, Text value, Context context) { if (initializing_) { conf_ = context.getConfiguration(); fileReaderClass_ = conf_.get("xdata.bayesnets.datasetreader.class"); fileReaderFilter_ = conf_.get("xdata.bayesnets.datasetreader.filter"); columnNames_ = conf_.get("xdata.bayesnets.datasetreader.variablenames").split(","); assertEquals(columnNames_.length, fileReaderFilter_.split(",").length); try { Object r = Class.forName(fileReaderClass_).newInstance(); reader_ = (DataSetReader<Integer, String>) r; } catch (InstantiationException e) { s_logger.error("Instantiation exception for DataSetReader " + fileReaderClass_); e.printStackTrace(); System.exit(1); } catch (IllegalAccessException e) { s_logger.error("IllegalAccess exception for DataSetReader " + fileReaderClass_); e.printStackTrace(); System.exit(1); } catch (ClassNotFoundException e) { s_logger.error("ClassDefNotFoundException for DataSetReader " + fileReaderClass_); e.printStackTrace(); System.exit(1); } catch (ClassCastException e) { s_logger.error("ClassCastException for DataSetReader " + fileReaderClass_); e.printStackTrace(); System.exit(1); } reader_.setFilter(fileReaderFilter_); reader_.setInstanceIDColumn(1); // doesn't matter, won't use initializing_ = false; } // we're initialized Instance<Integer, String> inst = reader_.parseLine(value.toString()); String[] vals = inst.getValue(); try { for (int i = 0; i < vals.length; ++i) { MapWritable mw = new MapWritable(); mw.put(new Text(vals[i]), new IntWritable(1)); context.write(new Text(columnNames_[i]), mw); } } catch (IOException e) { s_logger.error("I/O exception writing the map output"); e.printStackTrace(); } catch (InterruptedException e) { s_logger.error("Interrupted writing the map output"); e.printStackTrace(); } catch (NullPointerException e) { s_logger.error("Null pointer, probably unexpected data"); s_logger.error("Instance ID = " + inst.getID()); for (int i = 0; i < inst.getValue().length; ++i) { s_logger.error("Attribute_" + i + " = " + inst.getValue()[i]); } ; } }
From source file: smile.wide.AttributeValueHistogramReducer.java
License: Apache License
@Override
public void reduce(Text key, Iterable<MapWritable> values, Context context)
        throws IOException, InterruptedException {
    // Let's have a map and internally collect them
    int maps = 0;
    int vals = 0;
    HashMap<Text, Integer> myMap = new HashMap<Text, Integer>();

    for (MapWritable m : values) {
        maps++;
        for (Writable valName : m.keySet()) {
            Text val = (Text) valName;
            Integer count = ((IntWritable) (m.get(valName))).get();
            if (myMap.containsKey(val)) {
                myMap.put(val, myMap.get(val) + count);
            } else {
                myMap.put(val, count);
                vals++;
            }
        }
    }

    s_logger.debug("Reducer/combiner got " + maps + " maps, with a total of " + vals
            + " distinct values for attribute `" + key + "`");

    // now output
    // key is key
    // value is myMap as MapWritable<Text, IntWritable>
    MapWritable output = new MapWritable();
    for (Text t : myMap.keySet()) {
        s_logger.debug("Outputting count " + myMap.get(t) + " for attribute " + t);
        output.put(t, new IntWritable(myMap.get(t)));
    }
    context.write(key, output);
}
From source file: tl.lin.data.benchmark.basic.BenchmarkHashMapWritable.java
License: Apache License
/**
 * Runs this benchmark.
 */
public static void main(String[] args) throws Exception {
    long startTime = System.currentTimeMillis();
    int numTrials = 100000;

    Random rand = new Random();

    ByteArrayOutputStream[] storageHashMapWritable = new ByteArrayOutputStream[numTrials];
    for (int i = 0; i < numTrials; i++) {
        HashMapWritable<IntWritable, IntWritable> map = new HashMapWritable<IntWritable, IntWritable>();
        int size = rand.nextInt(50) + 50;

        for (int j = 0; j < size; j++) {
            map.put(new IntWritable(rand.nextInt(10000)), new IntWritable(rand.nextInt(10)));
        }

        ByteArrayOutputStream bytesOut = new ByteArrayOutputStream();
        DataOutputStream dataOut = new DataOutputStream(bytesOut);

        map.write(dataOut);
        storageHashMapWritable[i] = bytesOut;
    }

    System.out.println("Generating and serializing " + numTrials + " random HashMapWritables: "
            + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");

    startTime = System.currentTimeMillis();
    ByteArrayOutputStream[] storageMapWritable = new ByteArrayOutputStream[numTrials];
    for (int i = 0; i < numTrials; i++) {
        MapWritable map = new MapWritable();
        int size = rand.nextInt(50) + 50;

        for (int j = 0; j < size; j++) {
            map.put(new IntWritable(rand.nextInt(10000)), new IntWritable(rand.nextInt(10)));
        }

        ByteArrayOutputStream bytesOut = new ByteArrayOutputStream();
        DataOutputStream dataOut = new DataOutputStream(bytesOut);

        map.write(dataOut);
        storageMapWritable[i] = bytesOut;
    }

    System.out.println("Generating and serializing " + numTrials + " random MapWritables: "
            + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");

    float cntA = 0.0f;
    float cntB = 0.0f;
    for (int i = 0; i < numTrials; i++) {
        cntA += storageHashMapWritable[i].size();
        cntB += storageMapWritable[i].size();
    }
    System.out.println("Average size of each HashMapWritable: " + cntA / numTrials);
    System.out.println("Average size of each MapWritable: " + cntB / numTrials);

    startTime = System.currentTimeMillis();
    for (int i = 0; i < numTrials; i++) {
        HashMapWritable<IntWritable, IntWritable> map = new HashMapWritable<IntWritable, IntWritable>();
        map.readFields(new DataInputStream(new ByteArrayInputStream(storageHashMapWritable[i].toByteArray())));
    }
    System.out.println("Deserializing " + numTrials + " random HashMapWritables: "
            + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");

    startTime = System.currentTimeMillis();
    for (int i = 0; i < numTrials; i++) {
        MapWritable map = new MapWritable();
        map.readFields(new DataInputStream(new ByteArrayInputStream(storageMapWritable[i].toByteArray())));
    }
    System.out.println("Deserializing " + numTrials + " random MapWritables: "
            + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");
}
From source file: uk.ac.cam.eng.extraction.hadoop.util.ExtractorDataLoader.java
License: Apache License
/**
 * Loads word aligned parallel text to HDFS.
 *
 * @param sourceTextFile The source text file, gzipped, with one sentence
 *            per line, same number of lines as targetTextFile.
 * @param targetTextFile The target text file, gzipped, with one sentence
 *            per line, same number of lines as sourceTextFile.
 * @param wordAlignmentFile The word alignment file, gzipped, one alignment
 *            per line in Berkeley format ("0-0<SPACE>1-2, etc.", zero-based
 *            source index on the left), same number of lines as sourceTextFile.
 * @param provenanceFile The provenance file, gzipped, one set of
 *            provenances per line with format "prov1<SPACE>prov2, etc.", same
 *            number of lines as sourceTextFile.
 * @param hdfsName
 * @throws IOException
 */
public void loadTrainingData2Hdfs(String sourceTextFile, String targetTextFile, String wordAlignmentFile,
        String provenanceFile, String hdfsName) throws FileNotFoundException, IOException {
    try (BufferedReader src = new BufferedReader(
            new InputStreamReader(new GZIPInputStream(new FileInputStream(sourceTextFile))));
            BufferedReader trg = new BufferedReader(
                    new InputStreamReader(new GZIPInputStream(new FileInputStream(targetTextFile))));
            BufferedReader align = new BufferedReader(
                    new InputStreamReader(new GZIPInputStream(new FileInputStream(wordAlignmentFile))));
            BufferedReader prov = new BufferedReader(
                    new InputStreamReader(new GZIPInputStream(new FileInputStream(provenanceFile))))) {
        String srcLine = null, trgLine = null, alignLine = null, provLine = null;
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(conf);
        Path path = new Path(hdfsName);
        try (SequenceFile.Writer writer = new SequenceFile.Writer(fs, conf, path, MapWritable.class,
                TextArrayWritable.class)) {
            Text sourceSentenceText = new Text();
            Text targetSentenceText = new Text();
            Text alignmentText = new Text();
            Text[] array = new Text[3];
            array[0] = sourceSentenceText;
            array[1] = targetSentenceText;
            array[2] = alignmentText;
            TextArrayWritable arrayWritable = new TextArrayWritable();
            // metadata: provenance, e.g. genre, collection, training
            // instance id, doc id, etc.
            MapWritable metadata = new MapWritable();
            while ((srcLine = src.readLine()) != null && (trgLine = trg.readLine()) != null
                    && (alignLine = align.readLine()) != null && (provLine = prov.readLine()) != null) {
                metadata.clear();
                String[] provenances = provLine.split("\\s+");
                for (String provenance : provenances) {
                    metadata.put(new Text(provenance), NullWritable.get());
                }
                sourceSentenceText.set(srcLine);
                targetSentenceText.set(trgLine);
                // note, alignLine can be the empty string
                alignmentText.set(alignLine);
                arrayWritable.set(array);
                writer.append(metadata, arrayWritable);
            }
        }
    }
}