List of usage examples for org.apache.hadoop.io NullWritable get
public static NullWritable get()
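NullWritable.get() returns the shared singleton instance of NullWritable, the zero-byte Writable used wherever the MapReduce or SequenceFile APIs require a key or value slot that carries no data. Before the project examples below, here is a minimal, self-contained sketch of the common pattern; the class name and output path are illustrative assumptions rather than code from any of the source files listed.

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;

public class NullWritableKeyExample {
    public static void main(String[] args) throws IOException {
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(conf);
        Path path = new Path("/tmp/nullwritable-example.seq"); // illustrative path

        // NullWritable.get() always returns the same shared instance; it
        // serializes to zero bytes, so a SequenceFile keyed on it effectively
        // stores only the Text values.
        SequenceFile.Writer writer = SequenceFile.createWriter(fs, conf, path,
                NullWritable.class, Text.class);
        try {
            writer.append(NullWritable.get(), new Text("value with no key"));
        } finally {
            writer.close();
        }
    }
}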
From source file:com.linkedin.cubert.pig.piggybank.storage.avro.AvroStorage.java
License:Apache License
@Override
public void putNext(Tuple t) throws IOException {
    try {
        this.writer.write(NullWritable.get(), t.getAll().size() == 1 ? t.get(0) : t);
    } catch (InterruptedException e) {
        e.printStackTrace();
    }
}
From source file:com.linkedin.thirdeye.hadoop.aggregation.AggregationPhaseTest.java
License:Apache License
@Test
public void testAggregationPhase() throws Exception {
    int recordCount = 0;
    List<GenericRecord> inputRecords = generateTestMapperData();
    for (GenericRecord record : inputRecords) {
        AvroKey<GenericRecord> inKey = new AvroKey<GenericRecord>();
        inKey.datum(record);
        mapDriver.addInput(new Pair<AvroKey<GenericRecord>, NullWritable>(inKey, NullWritable.get()));
        recordCount++;
    }

    List<Pair<BytesWritable, BytesWritable>> mapResult = mapDriver.run();
    Assert.assertEquals("Incorrect number of records emitted by mapper", recordCount, mapResult.size());

    AggregationPhaseMapOutputKey keyWrapper =
        AggregationPhaseMapOutputKey.fromBytes(mapResult.get(0).getFirst().getBytes());
    Assert.assertEquals(406058, keyWrapper.getTime());
    keyWrapper = AggregationPhaseMapOutputKey.fromBytes(mapResult.get(1).getFirst().getBytes());
    Assert.assertEquals(406058, keyWrapper.getTime());
    keyWrapper = AggregationPhaseMapOutputKey.fromBytes(mapResult.get(2).getFirst().getBytes());
    Assert.assertEquals(406059, keyWrapper.getTime());

    List<Pair<BytesWritable, List<BytesWritable>>> reduceInput = generateTestReduceData(mapResult);
    reduceDriver.addAll(reduceInput);
    List<Pair<AvroKey<GenericRecord>, NullWritable>> reduceResult = reduceDriver.run();
    Assert.assertEquals("Incorrect number of records returned by aggregation reducer", 2, reduceResult.size());

    GenericRecord record = reduceResult.get(0).getFirst().datum();
    List<String> dimensionsExpected = Lists.newArrayList("abc1", "pqr1", "xyz1");
    List<String> dimensionsActual = getDimensionsFromRecord(record);
    Assert.assertEquals(dimensionsExpected, dimensionsActual);
    List<Integer> metricsExpected = Lists.newArrayList(200, 40);
    List<Integer> metricsActual = getMetricsFromRecord(record);
    Assert.assertEquals(metricsExpected, metricsActual);
    Assert.assertEquals(406058, (long) record.get("hoursSinceEpoch"));

    record = reduceResult.get(1).getFirst().datum();
    dimensionsExpected = Lists.newArrayList("abc2", "pqr2", "xyz2");
    dimensionsActual = getDimensionsFromRecord(record);
    Assert.assertEquals(dimensionsExpected, dimensionsActual);
    metricsExpected = Lists.newArrayList(10, 2);
    metricsActual = getMetricsFromRecord(record);
    Assert.assertEquals(metricsExpected, metricsActual);
    Assert.assertEquals(406059, (long) record.get("hoursSinceEpoch"));
}
From source file:com.linkedin.thirdeye.hadoop.derivedcolumn.transformation.DerivedColumnNoTransformationTest.java
License:Apache License
@Test
public void testTopKColumnTransformationPhase() throws Exception {
    int recordCount = 0;
    List<GenericRecord> inputRecords = generateTestData();
    for (GenericRecord record : inputRecords) {
        AvroKey<GenericRecord> inKey = new AvroKey<GenericRecord>();
        inKey.datum(record);
        mapDriver.addInput(new Pair<AvroKey<GenericRecord>, NullWritable>(inKey, NullWritable.get()));
        recordCount++;
    }

    resetAvroSerialization();
    List<Pair<AvroKey<GenericRecord>, NullWritable>> result = mapDriver.run();
    Assert.assertEquals(recordCount, result.size());

    for (Pair<AvroKey<GenericRecord>, NullWritable> pair : result) {
        GenericRecord datum = pair.getFirst().datum();
        System.out.println(datum.getSchema().getFields().size());
        Assert.assertEquals(
            "Input records must contain same number of fields as output record, when schemas are not transformed",
            datum.getSchema().getFields().size(), 6);
    }
}
From source file:com.linkedin.thirdeye.hadoop.derivedcolumn.transformation.DerivedColumnTransformationTest.java
License:Apache License
@Test
public void testTopKColumnTransformationPhase() throws Exception {
    int recordCount = 0;
    List<GenericRecord> inputRecords = generateTestData();
    for (GenericRecord record : inputRecords) {
        AvroKey<GenericRecord> inKey = new AvroKey<GenericRecord>();
        inKey.datum(record);
        mapDriver.addInput(new Pair<AvroKey<GenericRecord>, NullWritable>(inKey, NullWritable.get()));
        recordCount++;
    }

    resetAvroSerialization();
    List<Pair<AvroKey<GenericRecord>, NullWritable>> result = mapDriver.run();
    Assert.assertEquals(recordCount, result.size());

    for (Pair<AvroKey<GenericRecord>, NullWritable> pair : result) {
        GenericRecord datum = pair.getFirst().datum();
        Assert.assertEquals("TopKTransformationJob did not add new column for topk column",
            datum.getSchema().getField("d2_topk") != null, true);
        String d2 = (String) datum.get("d2");
        String d2_topk = (String) datum.get("d2_topk");
        Assert.assertEquals("Incorrect topk column transformation",
            (d2_topk.equals("other") && d2.equals("pqr1")) || (d2_topk.equals("pqr2") && d2.equals("pqr2")),
            true);
    }
}
From source file:com.linkedin.thirdeye.hadoop.topk.TopkPhaseTest.java
License:Apache License
@Test
public void testTopKColumnTransformationPhase() throws Exception {
    int recordCount = 0;
    List<GenericRecord> inputRecords = generateTestMapperData();
    for (GenericRecord record : inputRecords) {
        AvroKey<GenericRecord> inKey = new AvroKey<GenericRecord>();
        inKey.datum(record);
        mapDriver.addInput(new Pair<AvroKey<GenericRecord>, NullWritable>(inKey, NullWritable.get()));
        recordCount++;
    }

    List<Pair<BytesWritable, BytesWritable>> result = mapDriver.run();
    // for each record, we emit 2 records per dimension:
    // once for actual value of dimension, once for ALL,ALL
    Assert.assertEquals("Incorrect number of records emitted by mapper", recordCount * 3 * 2, result.size());

    Map<String, Integer> counts = new HashMap<>();
    for (Pair<BytesWritable, BytesWritable> pair : result) {
        TopKPhaseMapOutputKey key = TopKPhaseMapOutputKey.fromBytes(pair.getFirst().getBytes());
        String dimensionName = key.getDimensionName();
        Integer count = counts.get(dimensionName);
        if (count == null) {
            count = 0;
        }
        counts.put(dimensionName, count + 1);
    }
    Assert.assertEquals("Incorrect number of records emitted from map", 2, (int) counts.get("d1"));
    Assert.assertEquals("Incorrect number of records emitted from map", 2, (int) counts.get("d2"));
    Assert.assertEquals("Incorrect number of records emitted from map", 2, (int) counts.get("d3"));
    Assert.assertEquals("Incorrect number of records emitted from map", 6, (int) counts.get("0"));

    List<Pair<BytesWritable, List<BytesWritable>>> reduceInput = generateTestReduceData(result);
    reduceDriver.addAll(reduceInput);
    reduceDriver.run();

    File topKFile = new File(outputPath, ThirdEyeConstants.TOPK_VALUES_FILE);
    Assert.assertTrue("Topk file failed to generate!", topKFile.exists());
    TopKDimensionValues topk = OBJECT_MAPPER.readValue(new FileInputStream(topKFile), TopKDimensionValues.class);
    Map<String, Set<String>> topkMap = topk.getTopKDimensions();
    Assert.assertEquals("Incorrect topk object", topkMap.size(), 1);
    Assert.assertEquals("Incorrect topk values in topk object", Sets.newHashSet("pqr1"), topkMap.get("d2"));
    Assert.assertEquals("Incorrect whitelist values in topk object", null, topkMap.get("d3"));
}
From source file:com.linkedin.whiteelephant.mapreduce.MyAvroMultipleOutputs.java
License:Apache License
/**
 * Write key and value to the namedOutput.
 *
 * Output path is a unique file generated for the namedOutput.
 * For example, {namedOutput}-(m|r)-{part-number}
 *
 * @param namedOutput the named output name
 * @param key the key; the value is NullWritable
 */
@SuppressWarnings("unchecked")
public void write(String namedOutput, Object key) throws IOException, InterruptedException {
    write(namedOutput, key, NullWritable.get(), namedOutput);
}
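For context, a caller of this overload might look like the following hypothetical reducer snippet; the named output "stats", the amos field, and the reduce signature are illustrative assumptions, not code from MyAvroMultipleOutputs itself.

// Hypothetical usage sketch of the write(namedOutput, key) overload above.
private MyAvroMultipleOutputs amos;

protected void reduce(AvroKey<GenericRecord> key, Iterable<NullWritable> values, Context context)
        throws IOException, InterruptedException {
    // The value defaults to NullWritable.get(), so only the key is passed.
    amos.write("stats", key);
}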
From source file:com.m6d.filecrush.crush.CrushReducer.java
License:Apache License
@Override
public void reduce(Text bucketId, Iterator<Text> values, OutputCollector<Text, Text> collector,
        Reporter reporter) throws IOException {
    String bucket = bucketId.toString();
    String dirName = bucket.substring(0, bucket.lastIndexOf('-'));
    int idx = findMatcher(dirName);
    String outputFileName = calculateOutputFile(idx, dirName);

    /*
     * Don't need to separate the paths because the output file name is already absolute.
     */
    valueOut.set(outDirPath + outputFileName);
    LOG.info(format("Crushing bucket '%s' to file '%s'", bucket, outputFileName));

    /*
     * Strip the leading slash to make the path relative. The output format will relativize it
     * to the task attempt work dir.
     */
    RecordWriter<Object, Object> sink = null;
    FileSinkOperator.RecordWriter parquetSink = null;
    Exception rootCause = null;
    Void voidKey = null;
    Object key = null;
    Object value = null;
    String schemaSignature = null;
    String columns = null;
    String columnsTypes = null;
    Properties jobProperties = new Properties();
    boolean firstFile = true;

    try {
        while (null == rootCause && values.hasNext()) {
            Text srcFile = values.next();
            Path inputPath = new Path(srcFile.toString());
            RecordReader<Object, Object> reader = createRecordReader(idx, inputPath, reporter);

            if (firstFile) {
                firstFile = false;
                key = reader.createKey();
                if (null == key)
                    key = NullWritable.get();
                value = reader.createValue();

                if (AvroContainerInputFormat.class.isAssignableFrom(getInputFormatClass(idx))) {
                    schemaSignature = getAvroFileSchemaString(job, inputPath);
                    job.set("avro.schema.literal", schemaSignature);
                } else if (MapredParquetInputFormat.class.isAssignableFrom(getInputFormatClass(idx))) {
                    MessageType schema = getParquetFileSchema(job, inputPath);
                    List<Type> fieldsFromSchema = schema.getFields();
                    for (Type field : fieldsFromSchema) {
                        if (field.getOriginalType() != null) {
                            if (StringUtils.equals(field.getOriginalType().toString(), "DECIMAL")) {
                                String primitiveType = field.asPrimitiveType().toString();
                                int loc = primitiveType.indexOf("DECIMAL");
                                int start = loc + 7;
                                int end = primitiveType.indexOf(")", loc) + 1;
                                String ps = primitiveType.substring(start, end);
                                if (!decimalTypesHashMap.containsKey(ps)) {
                                    decimalTypesHashMap.put(field.getName().toString(), ps);
                                }
                            }
                        }
                    }
                    schemaSignature = getParquetFileSchemaString(job, inputPath);

                    StringBuilder columnsSb = new StringBuilder();
                    StringBuilder columnsTypesSb = new StringBuilder();
                    boolean firstColumn = true;
                    for (ColumnDescriptor col : schema.getColumns()) {
                        if (firstColumn) {
                            firstColumn = false;
                        } else {
                            columnsSb.append(",");
                            columnsTypesSb.append(",");
                        }
                        columnsSb.append(col.getPath()[0]);
                        String typeName = col.getType().toString();
                        if ("INT96".equals(typeName))
                            typeName = "timestamp";
                        else if ("INT64".equals(typeName))
                            typeName = "bigint";
                        else if ("INT32".equals(typeName))
                            typeName = "int";
                        else if ("INT16".equals(typeName))
                            typeName = "smallint";
                        else if ("INT8".equals(typeName))
                            typeName = "tinyint";
                        else if ("BINARY".equals(typeName))
                            typeName = "string";
                        else if ("BOOLEAN".equals(typeName))
                            typeName = "boolean";
                        else if ("DOUBLE".equals(typeName))
                            typeName = "double";
                        else if ("FLOAT".equals(typeName))
                            typeName = "float";
                        else if (typeName.startsWith("FIXED_LEN_BYTE_ARRAY")) {
                            String column = col.toString();
                            int start = column.indexOf('[') + 1;
                            int end = column.indexOf(']');
                            String fieldName = column.substring(start, end);
                            String lookupVal = decimalTypesHashMap.get(fieldName);
                            LOG.info("final string: decimal" + lookupVal);
                            typeName = "decimal" + lookupVal;
                        }
                        columnsTypesSb.append(typeName);
                    }
                    columns = columnsSb.toString();
                    columnsTypes = columnsTypesSb.toString();
                    jobProperties.put(IOConstants.COLUMNS, columns);
                    jobProperties.put(IOConstants.COLUMNS_TYPES, columnsTypes);
                    parquetSerDe = new ParquetHiveSerDe();
                    parquetSerDe.initialize(job, jobProperties);
                } else {
                    schemaSignature = key.getClass().getName() + ":" + value.getClass().getName();
                }

                /*
                 * Set the key and value class in the conf, which the output format uses to get type information.
                 */
                job.setOutputKeyClass(key.getClass());
                job.setOutputValueClass(value.getClass());

                /*
                 * Output file name is absolute so we can just add it to the crush prefix.
                 */
                if (MapredParquetOutputFormat.class.isAssignableFrom(getOutputFormatClass(idx))) {
                    outputFormat = "parquet";
                    parquetSink = createParquetRecordWriter(idx, valueOut.toString(), jobProperties,
                        (Class<? extends org.apache.hadoop.io.Writable>) value.getClass(), reporter);
                } else {
                    outputFormat = getOutputFormatClass(idx).getName();
                    sink = createRecordWriter(idx, valueOut.toString());
                }
            } else { // next files
                /*
                 * Ensure schema signature is the same as the first file's.
                 */
                String nextSchemaSignature = null;
                if (AvroContainerInputFormat.class.isAssignableFrom(getInputFormatClass(idx))) {
                    nextSchemaSignature = getAvroFileSchemaString(job, inputPath);
                } else if (MapredParquetInputFormat.class.isAssignableFrom(getInputFormatClass(idx))) {
                    nextSchemaSignature = getParquetFileSchemaString(job, inputPath);
                } else {
                    Object otherKey = reader.createKey();
                    if (otherKey == null)
                        otherKey = NullWritable.get();
                    nextSchemaSignature = otherKey.getClass().getName() + ":"
                        + reader.createValue().getClass().getName();
                }
                if (!schemaSignature.equals(nextSchemaSignature)) {
                    throw new IllegalArgumentException(
                        format("Heterogeneous schema detected in file %s: [%s] != [%s]", inputPath,
                            nextSchemaSignature, schemaSignature));
                }
            }

            boolean ret;
            if ("parquet".equals(outputFormat))
                ret = reader.next(voidKey, value);
            else
                ret = reader.next(key, value);
            while (ret) {
                if ("text".equals(inputFormat))
                    sink.write(key, null);
                else if (sink != null)
                    sink.write(key, value);
                else {
                    ParquetHiveRecord parquetHiveRecord = new ParquetHiveRecord(value,
                        (StructObjectInspector) parquetSerDe.getObjectInspector());
                    parquetSink.write(parquetHiveRecord);
                }
                reporter.incrCounter(ReducerCounter.RECORDS_CRUSHED, 1);
                if ("parquet".equals(outputFormat))
                    ret = reader.next(voidKey, value);
                else
                    ret = reader.next(key, value);
            }

            /*
             * Output of the reducer is the source file => crushed file (in the final output dir,
             * not the task attempt work dir).
             */
            collector.collect(srcFile, valueOut);
            reporter.incrCounter(ReducerCounter.FILES_CRUSHED, 1);

            recordNumber++;
            if (reportRecordNumber == recordNumber) {
                reportRecordNumber += reportRecordNumber;
                reporter.setStatus(format("Processed %,d files %s : %s", recordNumber, bucket, inputPath));
            }
        }
    } catch (Exception e) {
        rootCause = e;
    } finally {
        if (null != sink) {
            try {
                sink.close(reporter);
            } catch (Exception e) {
                if (null == rootCause) {
                    rootCause = e;
                } else {
                    LOG.error("Swallowing exception on close of " + outputFileName, e);
                }
            }
        }
        if (null != parquetSink) {
            try {
                parquetSink.close(false);
            } catch (Exception e) {
                if (null == rootCause) {
                    rootCause = e;
                } else {
                    LOG.error("Swallowing exception on close of " + outputFileName, e);
                }
            }
        }

        /*
         * Let the exception bubble up with a minimum of wrapping.
         */
        if (null != rootCause) {
            if (rootCause instanceof RuntimeException) {
                throw (RuntimeException) rootCause;
            }
            if (rootCause instanceof IOException) {
                throw (IOException) rootCause;
            }
            throw new RuntimeException(rootCause);
        }
    }
}
From source file:com.m6d.hive.protobuf.LongTest.java
License:Apache License
public void testBigDat() throws Exception {
    Path p = new Path(this.ROOT_DIR, "reallybigflat");
    SequenceFile.Writer w = SequenceFile.createWriter(this.getFileSystem(), new Configuration(), p,
        NullWritable.class, Text.class, SequenceFile.CompressionType.BLOCK);

    long startLoad = System.currentTimeMillis();
    int toLoad = load;
    for (int i = 0; i < toLoad; i++) {
        Text t = new Text();
        //t.set("ed\ted@email.com\t1\tjava\tbob\tbob@email.com\t3\tbball");
        t.set(randomString() + "\t" + randomString() + "\t" + randomInt() + "\t" + randomString() + "\t"
            + randomString() + "\t" + randomString() + "\t" + randomInt() + "\t" + randomString());
        w.append(NullWritable.get(), t);
    }
    w.close();
    System.out.println("len " + this.getFileSystem().getFileStatus(p).getLen());
    long endLoad = System.currentTimeMillis();
    System.out.println((endLoad - startLoad) + " time taken loading");

    String jarFile;
    jarFile = KVAsVSeqFileBinaryInputFormat.class.getProtectionDomain().getCodeSource().getLocation().getFile();
    client.execute("add jar " + jarFile);
    client.execute("set hive.aux.jars.path=file:///" + jarFile);
    client.execute("create table bigtext " + "(name string, email string , id int , hobby string, "
        + " name1 string, email1 string, id1 int , hobby1 string)"
        + " ROW FORMAT DELIMITED FIELDS TERMINATED BY '\\t' STORED AS SEQUENCEFILE");
    client.execute("load data local inpath '" + p.toString() + "' into table bigtext");

    long startQuery = System.currentTimeMillis();
    //client.execute("select distinct(name) from bigtext");
    //List<String> result = client.fetchAll();
    //Assert.assertEquals("edward", result);
    client.execute("SELECT count(1) FROM bigtext");
    List<String> results = client.fetchAll();
    Assert.assertEquals(toLoad + "", results.get(0));
    long endQuery = System.currentTimeMillis();
    System.out.println((endQuery - startQuery) + " Query time taken");
    client.execute("drop table bigproto");
}
From source file:com.marcolotz.lung.io.inputFormat.MultipleFilesRecordReader.java
License:Creative Commons License
/***
 * There is no key in this case. One can change this in the future to return
 * the absolute path of the processed file.
 */
@Override
public NullWritable getCurrentKey() throws IOException, InterruptedException {
    return NullWritable.get();
}
From source file:com.marcolotz.lung.reducer.LungReducer.java
License:Creative Commons License
@Override
protected void reduce(KeyStructureWritable inputKey, Iterable<ImageMetadata> values, Context context)
        throws IOException, InterruptedException {
    ReducedValueWritable reducedValue = new ReducedValueWritable();
    Iterator<ImageMetadata> itr = values.iterator();
    ArrayList<ImageMetadata> sortedList = new ArrayList<ImageMetadata>();

    /*
     * Generates a new list used for sorting.
     *
     * Careful: this may load all the values for the same key into memory.
     * In this application that is not a problem due to the value size, but
     * it may cause failures in other types of applications.
     */
    while (itr.hasNext()) {
        /*
         * A buffer is needed, otherwise the iterator will always send the
         * same element to the list. This is because an iterator in Hadoop
         * behaves a little differently than a usual one, since sometimes
         * the data is on disk and sometimes it is in memory.
         */
        // Clones the iterator content
        ImageMetadata buffer = new ImageMetadata(itr.next());
        sortedList.add(buffer);
    }

    // Sorted based on the Image Number attribute
    Collections.sort(sortedList);
    Iterator<ImageMetadata> sortedItr = sortedList.iterator();

    /* Add the values to the reduced list, once they are ordered */
    while (sortedItr.hasNext()) {
        reducedValue.addToReducedList(sortedItr.next());
    }

    /* Generates the job output structure */
    SeriesDataWritable seriesData = new SeriesDataWritable(inputKey, reducedValue);

    /* Emits the structure */
    context.write(seriesData, NullWritable.get());
}