Example usage for org.apache.hadoop.io NullWritable get

Introduction

On this page you can find example usages of org.apache.hadoop.io.NullWritable.get().

Prototype

public static NullWritable get() 

Document

Returns the single instance of this class.
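
Before the real-world usages below, here is a minimal, self-contained sketch (not taken from any of the projects listed under Usage) of the typical role of NullWritable.get(): it supplies the shared singleton wherever a key or value slot must be filled but carries no data. The output path is an illustrative assumption.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;

public class NullWritableExample {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        // Hypothetical output location; adjust to your environment.
        Path path = new Path("/tmp/nullwritable-example.seq");

        // NullWritable.get() returns the single shared instance; there is nothing to construct.
        SequenceFile.Writer writer = SequenceFile.createWriter(conf, SequenceFile.Writer.file(path),
                SequenceFile.Writer.keyClass(NullWritable.class), SequenceFile.Writer.valueClass(Text.class));
        try {
            writer.append(NullWritable.get(), new Text("value whose key carries no information"));
        } finally {
            writer.close();
        }
    }
}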

Usage

From source file: com.linkedin.cubert.pig.piggybank.storage.avro.AvroStorage.java

License: Apache License

@Override
public void putNext(Tuple t) throws IOException {
    try {
        this.writer.write(NullWritable.get(), t.getAll().size() == 1 ? t.get(0) : t);
    } catch (InterruptedException e) {
        e.printStackTrace();
    }
}

From source file: com.linkedin.thirdeye.hadoop.aggregation.AggregationPhaseTest.java

License: Apache License

@Test
public void testAggregationPhase() throws Exception {

    int recordCount = 0;
    List<GenericRecord> inputRecords = generateTestMapperData();
    for (GenericRecord record : inputRecords) {
        AvroKey<GenericRecord> inKey = new AvroKey<GenericRecord>();
        inKey.datum(record);
        mapDriver.addInput(new Pair<AvroKey<GenericRecord>, NullWritable>(inKey, NullWritable.get()));
        recordCount++;
    }

    List<Pair<BytesWritable, BytesWritable>> mapResult = mapDriver.run();
    Assert.assertEquals("Incorrect number of records emitted by mapper", recordCount, mapResult.size());

    AggregationPhaseMapOutputKey keyWrapper = AggregationPhaseMapOutputKey
            .fromBytes(mapResult.get(0).getFirst().getBytes());
    Assert.assertEquals(406058, keyWrapper.getTime());
    keyWrapper = AggregationPhaseMapOutputKey.fromBytes(mapResult.get(1).getFirst().getBytes());
    Assert.assertEquals(406058, keyWrapper.getTime());
    keyWrapper = AggregationPhaseMapOutputKey.fromBytes(mapResult.get(2).getFirst().getBytes());
    Assert.assertEquals(406059, keyWrapper.getTime());

    List<Pair<BytesWritable, List<BytesWritable>>> reduceInput = generateTestReduceData(mapResult);
    reduceDriver.addAll(reduceInput);

    List<Pair<AvroKey<GenericRecord>, NullWritable>> reduceResult = reduceDriver.run();
    Assert.assertEquals("Incorrect number of records returned by aggregation reducer", 2, reduceResult.size());

    GenericRecord record = reduceResult.get(0).getFirst().datum();
    List<String> dimensionsExpected = Lists.newArrayList("abc1", "pqr1", "xyz1");
    List<String> dimensionsActual = getDimensionsFromRecord(record);
    Assert.assertEquals(dimensionsExpected, dimensionsActual);
    List<Integer> metricsExpected = Lists.newArrayList(200, 40);
    List<Integer> metricsActual = getMetricsFromRecord(record);
    Assert.assertEquals(metricsExpected, metricsActual);
    Assert.assertEquals(406058, (long) record.get("hoursSinceEpoch"));

    record = reduceResult.get(1).getFirst().datum();
    dimensionsExpected = Lists.newArrayList("abc2", "pqr2", "xyz2");
    dimensionsActual = getDimensionsFromRecord(record);
    Assert.assertEquals(dimensionsExpected, dimensionsActual);
    metricsExpected = Lists.newArrayList(10, 2);
    metricsActual = getMetricsFromRecord(record);
    Assert.assertEquals(metricsExpected, metricsActual);
    Assert.assertEquals(406059, (long) record.get("hoursSinceEpoch"));
}

From source file: com.linkedin.thirdeye.hadoop.derivedcolumn.transformation.DerivedColumnNoTransformationTest.java

License: Apache License

@Test
public void testTopKColumnTransformationPhase() throws Exception {
    int recordCount = 0;

    List<GenericRecord> inputRecords = generateTestData();
    for (GenericRecord record : inputRecords) {
        AvroKey<GenericRecord> inKey = new AvroKey<GenericRecord>();
        inKey.datum(record);
        mapDriver.addInput(new Pair<AvroKey<GenericRecord>, NullWritable>(inKey, NullWritable.get()));
        recordCount++;
    }

    resetAvroSerialization();
    List<Pair<AvroKey<GenericRecord>, NullWritable>> result = mapDriver.run();
    Assert.assertEquals(recordCount, result.size());

    for (Pair<AvroKey<GenericRecord>, NullWritable> pair : result) {
        GenericRecord datum = pair.getFirst().datum();
        System.out.println(datum.getSchema().getFields().size());
        Assert.assertEquals(
                "Input records must contain same number of fields as output record, when schemas are not transformed",
                datum.getSchema().getFields().size(), 6);
    }
}

From source file: com.linkedin.thirdeye.hadoop.derivedcolumn.transformation.DerivedColumnTransformationTest.java

License: Apache License

@Test
public void testTopKColumnTransformationPhase() throws Exception {
    int recordCount = 0;

    List<GenericRecord> inputRecords = generateTestData();
    for (GenericRecord record : inputRecords) {
        AvroKey<GenericRecord> inKey = new AvroKey<GenericRecord>();
        inKey.datum(record);
        mapDriver.addInput(new Pair<AvroKey<GenericRecord>, NullWritable>(inKey, NullWritable.get()));
        recordCount++;
    }

    resetAvroSerialization();
    List<Pair<AvroKey<GenericRecord>, NullWritable>> result = mapDriver.run();
    Assert.assertEquals(recordCount, result.size());

    for (Pair<AvroKey<GenericRecord>, NullWritable> pair : result) {
        GenericRecord datum = pair.getFirst().datum();
        Assert.assertEquals("TopKTransformationJob did not add new column for topk column",
                datum.getSchema().getField("d2_topk") != null, true);
        String d2 = (String) datum.get("d2");
        String d2_topk = (String) datum.get("d2_topk");
        Assert.assertEquals("Incorrect topk column transformation",
                (d2_topk.equals("other") && d2.equals("pqr1")) || (d2_topk.equals("pqr2") && d2.equals("pqr2")),
                true);
    }
}

From source file: com.linkedin.thirdeye.hadoop.topk.TopkPhaseTest.java

License: Apache License

@Test
public void testTopKColumnTransformationPhase() throws Exception {

    int recordCount = 0;
    List<GenericRecord> inputRecords = generateTestMapperData();
    for (GenericRecord record : inputRecords) {
        AvroKey<GenericRecord> inKey = new AvroKey<GenericRecord>();
        inKey.datum(record);
        mapDriver.addInput(new Pair<AvroKey<GenericRecord>, NullWritable>(inKey, NullWritable.get()));
        recordCount++;
    }

    List<Pair<BytesWritable, BytesWritable>> result = mapDriver.run();
    // for each record, we emit 2 records per dimension:
    // once for actual value of dimension, once for ALL,ALL
    Assert.assertEquals("Incorrect number of records emitted by mapper", recordCount * 3 * 2, result.size());

    Map<String, Integer> counts = new HashMap<>();
    for (Pair<BytesWritable, BytesWritable> pair : result) {
        TopKPhaseMapOutputKey key = TopKPhaseMapOutputKey.fromBytes(pair.getFirst().getBytes());
        String dimensionName = key.getDimensionName();
        Integer count = counts.get(dimensionName);
        if (count == null) {
            count = 0;
        }
        counts.put(dimensionName, count + 1);
    }
    Assert.assertEquals("Incorrect number of records emitted from map", 2, (int) counts.get("d1"));
    Assert.assertEquals("Incorrect number of records emitted from map", 2, (int) counts.get("d2"));
    Assert.assertEquals("Incorrect number of records emitted from map", 2, (int) counts.get("d3"));
    Assert.assertEquals("Incorrect number of records emitted from map", 6, (int) counts.get("0"));

    List<Pair<BytesWritable, List<BytesWritable>>> reduceInput = generateTestReduceData(result);
    reduceDriver.addAll(reduceInput);
    reduceDriver.run();

    File topKFile = new File(outputPath, ThirdEyeConstants.TOPK_VALUES_FILE);
    Assert.assertTrue("Topk file failed to generate!", topKFile.exists());
    TopKDimensionValues topk = OBJECT_MAPPER.readValue(new FileInputStream(topKFile),
            TopKDimensionValues.class);
    Map<String, Set<String>> topkMap = topk.getTopKDimensions();
    Assert.assertEquals("Incorrect topk object", topkMap.size(), 1);
    Assert.assertEquals("Incorrect topk values in topk object", Sets.newHashSet("pqr1"), topkMap.get("d2"));
    Assert.assertEquals("Incorrect whitelist values in topk object", null, topkMap.get("d3"));
}

From source file: com.linkedin.whiteelephant.mapreduce.MyAvroMultipleOutputs.java

License: Apache License

/**
 * Write key and value to the namedOutput.
 *
 * Output path is a unique file generated for the namedOutput.
 * For example, {namedOutput}-(m|r)-{part-number}
 *
 * @param namedOutput the named output name
 * @param key         the key; the value is NullWritable
 */
@SuppressWarnings("unchecked")
public void write(String namedOutput, Object key) throws IOException, InterruptedException {
    write(namedOutput, key, NullWritable.get(), namedOutput);
}
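
As a hedged illustration of how this two-argument overload might be called from a job, the sketch below assumes that MyAvroMultipleOutputs mirrors Hadoop's MultipleOutputs (a constructor taking the task context and a close() method) and uses a hypothetical "sessions" named output; none of these details come from the White Elephant source shown above.

import java.io.IOException;

import org.apache.avro.generic.GenericRecord;
import org.apache.avro.mapred.AvroKey;
import org.apache.avro.mapred.AvroValue;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

import com.linkedin.whiteelephant.mapreduce.MyAvroMultipleOutputs;

public class SessionReducer extends Reducer<Text, AvroValue<GenericRecord>, AvroKey<GenericRecord>, NullWritable> {

    private MyAvroMultipleOutputs outputs;

    @Override
    protected void setup(Context context) {
        // Assumption: a MultipleOutputs-style constructor that takes the task context.
        outputs = new MyAvroMultipleOutputs(context);
    }

    @Override
    protected void reduce(Text key, Iterable<AvroValue<GenericRecord>> values, Context context)
            throws IOException, InterruptedException {
        for (AvroValue<GenericRecord> value : values) {
            // The two-argument overload shown above pairs the record with NullWritable.get() internally.
            outputs.write("sessions", new AvroKey<GenericRecord>(value.datum()));
        }
    }

    @Override
    protected void cleanup(Context context) throws IOException, InterruptedException {
        outputs.close();
    }
}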

From source file: com.m6d.filecrush.crush.CrushReducer.java

License: Apache License

@Override
public void reduce(Text bucketId, Iterator<Text> values, OutputCollector<Text, Text> collector,
        Reporter reporter) throws IOException {
    String bucket = bucketId.toString();

    String dirName = bucket.substring(0, bucket.lastIndexOf('-'));

    int idx = findMatcher(dirName);

    String outputFileName = calculateOutputFile(idx, dirName);

    /*
     * Don't need to separate the paths because the output file name is already absolute.
     */
    valueOut.set(outDirPath + outputFileName);

    LOG.info(format("Crushing bucket '%s' to file '%s'", bucket, outputFileName));

    /*
     * Strip the leading slash to make the path relative. the output format will relativize it to the task attempt work dir.
     */
    RecordWriter<Object, Object> sink = null;
    FileSinkOperator.RecordWriter parquetSink = null;
    Exception rootCause = null;

    Void voidKey = null;
    Object key = null;
    Object value = null;

    String schemaSignature = null;
    String columns = null;
    String columnsTypes = null;
    Properties jobProperties = new Properties();
    boolean firstFile = true;

    try {
        while (null == rootCause && values.hasNext()) {
            Text srcFile = values.next();
            Path inputPath = new Path(srcFile.toString());
            RecordReader<Object, Object> reader = createRecordReader(idx, inputPath, reporter);

            if (firstFile) {
                firstFile = false;

                key = reader.createKey();
                if (null == key)
                    key = NullWritable.get();
                value = reader.createValue();

                if (AvroContainerInputFormat.class.isAssignableFrom(getInputFormatClass(idx))) {
                    schemaSignature = getAvroFileSchemaString(job, inputPath);
                    job.set("avro.schema.literal", schemaSignature);
                } else if (MapredParquetInputFormat.class.isAssignableFrom(getInputFormatClass(idx))) {
                    MessageType schema = getParquetFileSchema(job, inputPath);
                    List<Type> fieldsFromSchema = schema.getFields();
                    for (Type field : fieldsFromSchema) {
                        if (field.getOriginalType() != null) {
                            if (StringUtils.equals(field.getOriginalType().toString(), "DECIMAL")) {
                                String primitiveType = field.asPrimitiveType().toString();
                                int loc = primitiveType.indexOf("DECIMAL");
                                int start = loc + 7;
                                int end = primitiveType.indexOf(")", loc) + 1;
                                String ps = primitiveType.substring(start, end);
                                if (!decimalTypesHashMap.containsKey(ps)) {
                                    decimalTypesHashMap.put(field.getName().toString(), ps);
                                }
                            }
                        }
                    }
                    schemaSignature = getParquetFileSchemaString(job, inputPath);
                    StringBuilder columnsSb = new StringBuilder();
                    StringBuilder columnsTypesSb = new StringBuilder();
                    boolean firstColumn = true;
                    for (ColumnDescriptor col : schema.getColumns()) {
                        if (firstColumn) {
                            firstColumn = false;
                        } else {
                            columnsSb.append(",");
                            columnsTypesSb.append(",");
                        }
                        columnsSb.append(col.getPath()[0]);
                        String typeName = col.getType().toString();
                        if ("INT96".equals(typeName))
                            typeName = "timestamp";
                        else if ("INT64".equals(typeName))
                            typeName = "bigint";
                        else if ("INT32".equals(typeName))
                            typeName = "int";
                        else if ("INT16".equals(typeName))
                            typeName = "smallint";
                        else if ("INT8".equals(typeName))
                            typeName = "tinyint";
                        else if ("BINARY".equals(typeName))
                            typeName = "string";
                        else if ("BOOLEAN".equals(typeName))
                            typeName = "boolean";
                        else if ("DOUBLE".equals(typeName))
                            typeName = "double";
                        else if ("FLOAT".equals(typeName))
                            typeName = "float";
                        else if (typeName.startsWith("FIXED_LEN_BYTE_ARRAY")) {
                            String column = col.toString();
                            int start = column.indexOf('[') + 1;
                            int end = column.indexOf(']');
                            String fieldName = column.substring(start, end);
                            String lookupVal = decimalTypesHashMap.get(fieldName);
                            LOG.info("final string: decimal" + lookupVal);
                            typeName = "decimal" + lookupVal;
                        }
                        columnsTypesSb.append(typeName);
                    }
                    columns = columnsSb.toString();
                    columnsTypes = columnsTypesSb.toString();
                    jobProperties.put(IOConstants.COLUMNS, columns);
                    jobProperties.put(IOConstants.COLUMNS_TYPES, columnsTypes);
                    parquetSerDe = new ParquetHiveSerDe();
                    parquetSerDe.initialize(job, jobProperties);
                } else {
                    schemaSignature = key.getClass().getName() + ":" + value.getClass().getName();
                }

                /*
                 * Set the key and value class in the conf, which the output format uses to get type information.
                 */
                job.setOutputKeyClass(key.getClass());
                job.setOutputValueClass(value.getClass());

                /*
                 * Output file name is absolute so we can just add it to the crush prefix.
                 */
                if (MapredParquetOutputFormat.class.isAssignableFrom(getOutputFormatClass(idx))) {
                    outputFormat = "parquet";
                    parquetSink = createParquetRecordWriter(idx, valueOut.toString(), jobProperties,
                            (Class<? extends org.apache.hadoop.io.Writable>) value.getClass(), reporter);
                } else {
                    outputFormat = getOutputFormatClass(idx).getName();
                    sink = createRecordWriter(idx, valueOut.toString());
                }
            } else { // next files

                /*
                 * Ensure schema signature is the same as the first file's
                 */
                String nextSchemaSignature = null;
                if (AvroContainerInputFormat.class.isAssignableFrom(getInputFormatClass(idx))) {
                    nextSchemaSignature = getAvroFileSchemaString(job, inputPath);
                } else if (MapredParquetInputFormat.class.isAssignableFrom(getInputFormatClass(idx))) {
                    nextSchemaSignature = getParquetFileSchemaString(job, inputPath);
                } else {
                    Object otherKey = reader.createKey();
                    if (otherKey == null)
                        otherKey = NullWritable.get();
                    nextSchemaSignature = otherKey.getClass().getName() + ":"
                            + reader.createValue().getClass().getName();
                }
                if (!schemaSignature.equals(nextSchemaSignature)) {
                    throw new IllegalArgumentException(
                            format("Heterogeneous schema detected in file %s: [%s] != [%s]", inputPath,
                                    nextSchemaSignature, schemaSignature));
                }
            }

            boolean ret;
            if ("parquet".equals(outputFormat))
                ret = reader.next(voidKey, value);
            else
                ret = reader.next(key, value);
            while (ret) {
                if ("text".equals(inputFormat))
                    sink.write(key, null);
                else if (sink != null)
                    sink.write(key, value);
                else {
                    ParquetHiveRecord parquetHiveRecord = new ParquetHiveRecord(value,
                            (StructObjectInspector) parquetSerDe.getObjectInspector());
                    parquetSink.write(parquetHiveRecord);
                }
                reporter.incrCounter(ReducerCounter.RECORDS_CRUSHED, 1);

                if ("parquet".equals(outputFormat))
                    ret = reader.next(voidKey, value);
                else
                    ret = reader.next(key, value);
            }
            /*
             * Output of the reducer is the source file => crushed file (in the final output dir, not the task attempt work dir).
             */
            collector.collect(srcFile, valueOut);
            reporter.incrCounter(ReducerCounter.FILES_CRUSHED, 1);

            recordNumber++;

            if (reportRecordNumber == recordNumber) {
                reportRecordNumber += reportRecordNumber;

                reporter.setStatus(format("Processed %,d files %s : %s", recordNumber, bucket, inputPath));
            }
        }
    } catch (Exception e) {
        rootCause = e;
    } finally {
        if (null != sink) {
            try {
                sink.close(reporter);
            } catch (Exception e) {
                if (null == rootCause) {
                    rootCause = e;
                } else {
                    LOG.error("Swallowing exception on close of " + outputFileName, e);
                }
            }
        }
        if (null != parquetSink) {
            try {
                parquetSink.close(false);
            } catch (Exception e) {
                if (null == rootCause) {
                    rootCause = e;
                } else {
                    LOG.error("Swallowing exception on close of " + outputFileName, e);
                }
            }
        }

        /*
         * Let the exception bubble up with a minimum of wrapping.
         */
        if (null != rootCause) {
            if (rootCause instanceof RuntimeException) {
                throw (RuntimeException) rootCause;
            }

            if (rootCause instanceof IOException) {
                throw (IOException) rootCause;
            }

            throw new RuntimeException(rootCause);
        }
    }
}

From source file: com.m6d.hive.protobuf.LongTest.java

License: Apache License

public void testBigDat() throws Exception {
    Path p = new Path(this.ROOT_DIR, "reallybigflat");
    SequenceFile.Writer w = SequenceFile.createWriter(this.getFileSystem(), new Configuration(), p,
            NullWritable.class, Text.class, SequenceFile.CompressionType.BLOCK);

    long startLoad = System.currentTimeMillis();
    int toLoad = load;
    for (int i = 0; i < toLoad; i++) {
        Text t = new Text();
        //t.set("ed\ted@email.com\t1\tjava\tbob\tbob@email.com\t3\tbball");
        t.set(randomString() + "\t" + randomString() + "\t" + randomInt() + "\t" + randomString() + "\t"
                + randomString() + "\t" + randomString() + "\t" + randomInt() + "\t" + randomString());

        w.append(NullWritable.get(), t);
    }
    w.close();
    System.out.println("len " + this.getFileSystem().getFileStatus(p).getLen());
    long endLoad = System.currentTimeMillis();
    System.out.println((endLoad - startLoad) + " time taken loading");

    String jarFile;
    jarFile = KVAsVSeqFileBinaryInputFormat.class.getProtectionDomain().getCodeSource().getLocation().getFile();
    client.execute("add jar " + jarFile);
    client.execute("set hive.aux.jars.path=file:///" + jarFile);

    client.execute("create table  bigtext   " + "(name string, email string , id int , hobby string, "
            + " name1 string, email1 string, id1 int , hobby1 string)"
            + " ROW FORMAT DELIMITED FIELDS TERMINATED BY '\\t' STORED AS SEQUENCEFILE");

    client.execute("load data local inpath '" + p.toString() + "' into table bigtext");

    long startQuery = System.currentTimeMillis();
    //client.execute( "select distinct(name) from bigtext");
    //List<String> result = client.fetchAll();
    //Assert.assertEquals("edward", result);
    client.execute("SELECT count(1) FROM bigtext");
    List<String> results = client.fetchAll();
    Assert.assertEquals(toLoad + "", results.get(0));
    long endQuery = System.currentTimeMillis();

    System.out.println((endQuery - startQuery) + " Query time taken");
    client.execute("drop table bigproto");

}

From source file: com.marcolotz.lung.io.inputFormat.MultipleFilesRecordReader.java

License: Creative Commons License

/***
 * There is no key in this case. One can change this in the future to return
 * the absolute path of the processed file.
 */
@Override
public NullWritable getCurrentKey() throws IOException, InterruptedException {
    return NullWritable.get();
}

From source file: com.marcolotz.lung.reducer.LungReducer.java

License: Creative Commons License

@Override
protected void reduce(KeyStructureWritable inputKey, Iterable<ImageMetadata> values, Context context)
        throws IOException, InterruptedException {

    ReducedValueWritable reducedValue = new ReducedValueWritable();

    Iterator<ImageMetadata> itr = values.iterator();

    ArrayList<ImageMetadata> sortedList = new ArrayList<ImageMetadata>();

    /*
     * Generates a new list used for sorting
     * 
     * Careful: This may load all the values for the same key into memory.
     * In this application this is not a problem due to the value size, but
     * may cause failures in other types of applications
     */
    while (itr.hasNext()) {
        /*
         * One needs a buffer, otherwise the iterator will always send the
         * same element to the list. This is because an iterator in Hadoop
         * behaves a little differently from a usual one: sometimes the data
         * is on disk and sometimes it is in memory.
         */

        // Clones iterator content
        ImageMetadata buffer = new ImageMetadata(itr.next());

        sortedList.add(buffer);
    }

    // sorted based on the Image Number attribute
    Collections.sort(sortedList);

    Iterator<ImageMetadata> sortedItr = sortedList.iterator();

    /* Add the values the reduced List, once they are ordered */
    while (sortedItr.hasNext()) {
        reducedValue.addToReducedList(sortedItr.next());
    }

    /* Generates the job output structure */
    SeriesDataWritable seriesData = new SeriesDataWritable(inputKey, reducedValue);

    /* Emits the structure */
    context.write(seriesData, NullWritable.get());
}