Example usage for org.apache.hadoop.io NullWritable get

List of usage examples for org.apache.hadoop.io NullWritable get

Introduction

On this page you can find example usage of org.apache.hadoop.io.NullWritable.get().

Prototype

public static NullWritable get() 

Document

Returns the single instance of this class.
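
Because get() always returns the same shared instance, NullWritable is commonly used as a placeholder wherever the MapReduce API requires a key or value but no data is needed. A minimal, self-contained sketch (not taken from any of the projects below) showing the singleton behavior:

import org.apache.hadoop.io.NullWritable;

public class NullWritableDemo {
    public static void main(String[] args) {
        NullWritable a = NullWritable.get();
        NullWritable b = NullWritable.get();
        // Singleton: every call hands back the same object.
        System.out.println(a == b);          // true
        System.out.println(a.compareTo(b));  // 0
        // write() and readFields() are no-ops, so a NullWritable
        // adds zero bytes to serialized output.
    }
}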

Usage

From source file:com.datasalt.pangool.tuplemr.mapred.TestTupleMRJob.java

License:Apache License

@Test
public void testJobWithNulls()
        throws IOException, TupleMRException, ClassNotFoundException, InterruptedException {
    Configuration conf = getConf();
    String input1 = TestTupleMRJob.class.getCanonicalName() + "-input1";
    String input2 = TestTupleMRJob.class.getCanonicalName() + "-input2";
    String output = TestTupleMRJob.class.getCanonicalName() + "-output";

    final Schema schemaNoNulls = new Schema("NoNulls", Fields.parse("f1:int,f2:string"));
    final Schema schemaNulls = new Schema("Nulls", Fields.parse("f1:int?,f2:string?"));
    Tuple t1 = new Tuple(schemaNoNulls);
    Tuple t2 = new Tuple(schemaNulls);

    t1.set(0, 0);
    t1.set(1, "nn");
    withTupleInput(input1, t1);

    Object tuples[][] = new Object[][] { new Object[] { 0, null }, new Object[] { 0, "n1" },
            new Object[] { null, "n2" } };
    for (Object[] tuple : tuples) {
        t2.set(0, tuple[0]);
        t2.set(1, tuple[1]);
        withTupleInput(input2, t2);
    }

    TupleMRBuilder builder = new TupleMRBuilder(getConf(), "test");
    builder.addTupleInput(new Path(input1), new IdentityTupleMapper());
    builder.addTupleInput(new Path(input2), new IdentityTupleMapper());

    builder.setTupleReducer(new TupleReducer<ITuple, NullWritable>() {
        @Override
        public void reduce(ITuple group, Iterable<ITuple> tuples, TupleMRContext context, Collector collector)
                throws IOException, InterruptedException, TupleMRException {
            int count = 0;
            for (ITuple tuple : tuples) {
                Tuple t = new Tuple(schemaNulls);
                t.set(0, tuple.get(0));
                t.set(1, tuple.get(1));
                collector.write(t, NullWritable.get());
                count++;
            }
            if (group.get(0) == null) {
                assertEquals(1, count);
            } else if (((Integer) group.get(0)) == 0) {
                assertEquals(3, count);
            }
        }
    });
    builder.addIntermediateSchema(schemaNoNulls);
    builder.addIntermediateSchema(schemaNulls);
    builder.setGroupByFields("f1");
    builder.setOrderBy(OrderBy.parse("f1:desc|null_smallest").addSchemaOrder(Criteria.Order.ASC));
    builder.setSpecificOrderBy("NoNulls", OrderBy.parse("f2:asc|null_biggest"));
    builder.setSpecificOrderBy("Nulls", OrderBy.parse("f2:asc|null_biggest"));
    builder.setTupleOutput(new Path(output), schemaNulls);

    Job job = builder.createJob();
    job.setNumReduceTasks(1);
    try {
        assertRun(job);
    } finally {
        builder.cleanUpInstanceFiles();
    }

    final Object expectedOutput[][] = new Object[][] { new Object[] { 0, "nn" }, new Object[] { 0, "n1" },
            new Object[] { 0, null }, new Object[] { null, "n2" } };

    boolean debug = false;
    if (debug) {
        readTuples(new Path(output + "/part-r-00000"), getConf(), new TupleVisitor() {
            @Override
            public void onTuple(ITuple t) {
                System.out.println(t);
            }
        });
    }

    readTuples(new Path(output + "/part-r-00000"), getConf(), new TupleVisitor() {
        int i = 0;

        @Override
        public void onTuple(ITuple t) {
            assertEqualsNull(expectedOutput[i][0], t.get(0));
            Object f2 = t.get(1);
            f2 = (f2 != null) ? f2.toString() : f2;
            assertEqualsNull(expectedOutput[i][1], f2);
            i++;
        }
    });

    trash(input1);
    trash(input2);
    trash(output);
}

From source file:com.datasalt.pangool.utils.test.AbstractHadoopTestLibrary.java

License:Apache License

public AbstractHadoopTestLibrary withInput(String input, Object key) throws IOException {
    return withInput(input, key, NullWritable.get());
}

From source file:com.datasalt.pangool.utils.test.AbstractHadoopTestLibrary.java

License:Apache License

public void withOutput(String output, Object key)
        throws IOException, ClassNotFoundException, InstantiationException, IllegalAccessException {
    withOutput(output, key, NullWritable.get());
}

From source file:com.ebay.erl.mobius.core.datajoin.DataJoinKey.java

License:Apache License

private WritableComparable getKey(byte type, DataInputBuffer input) throws IOException {
    if (type == Tuple.NULL_WRITABLE_TYPE)
        return NullWritable.get();
    else if (type == Tuple.TUPLE_TYPE) {
        Tuple newTuple = new Tuple();
        newTuple.readFields(input);
        return newTuple;
    } else {
        WritableComparable w = (WritableComparable) ReflectionUtils.newInstance(Util.getClass(input.readUTF()),
                conf);
        w.readFields(input);
        return w;
    }
}
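
The matching write path is not part of this snippet. A hypothetical counterpart is sketched below (the method name writeKey and the USER_DEFINED_TYPE marker are assumptions; only Tuple.NULL_WRITABLE_TYPE and Tuple.TUPLE_TYPE come from the code above). It writes just the type byte for a NullWritable key, because NullWritable's write() emits nothing:

private void writeKey(WritableComparable key, DataOutputBuffer output) throws IOException {
    if (key instanceof NullWritable) {
        // Marker byte only; NullWritable.get().write(output) would add zero bytes.
        output.writeByte(Tuple.NULL_WRITABLE_TYPE);
    } else if (key instanceof Tuple) {
        output.writeByte(Tuple.TUPLE_TYPE);
        key.write(output);
    } else {
        output.writeByte(USER_DEFINED_TYPE); // hypothetical marker for arbitrary key classes
        output.writeUTF(key.getClass().getName());
        key.write(output);
    }
}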

From source file:com.ebay.erl.mobius.core.datajoin.EvenlyPartitioner.java

License:Apache License

/**
 * Read the cut points from the given IFile.
 * @param fs The file system
 * @param p The path to read
 * @param keyClass The map output key class
 * @param job The job config
 * @throws IOException
 */
// matching key types enforced by passing in
@SuppressWarnings("unchecked") // map output key class
private K[] readPartitions(FileSystem fs, Path p, Class<K> keyClass, JobConf job) throws IOException {
    SequenceFile.Reader reader = new SequenceFile.Reader(fs, p, job);
    ArrayList<K> parts = new ArrayList<K>();
    K key = (K) ReflectionUtils.newInstance(keyClass, job);
    NullWritable value = NullWritable.get();
    while (reader.next(key, value)) {
        parts.add(key);
        key = (K) ReflectionUtils.newInstance(keyClass, job);
    }
    reader.close();
    return parts.toArray((K[]) Array.newInstance(keyClass, parts.size()));
}

From source file:com.ebay.erl.mobius.core.mapred.AbstractMobiusMapper.java

License:Apache License

protected void outputRecords(Tuple key, Tuple value,
        OutputCollector<WritableComparable<?>, WritableComparable<?>> output) throws IOException {
    if (this._IS_MAP_ONLY_JOB) {
        // map-only job: no join is performed, so the key is not needed.
        output.collect(NullWritable.get(), value);
    } else {
        if (key == null) {
            // should never happen: this path performs a join/group-by,
            // but no key was provided.
            throw new IllegalArgumentException("key for dataset: " + this.getDatasetID()
                    + " cannot be empty when performing join/group by.");
        }
        output.collect(key, value);
    }
}
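
The same pattern appears in plain MapReduce jobs: when only the values matter, NullWritable.get() stands in for the key. A minimal sketch using the newer org.apache.hadoop.mapreduce API (the class name ValueOnlyMapper is illustrative, not part of Mobius):

import java.io.IOException;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

public class ValueOnlyMapper extends Mapper<LongWritable, Text, NullWritable, Text> {
    @Override
    protected void map(LongWritable offset, Text line, Context context)
            throws IOException, InterruptedException {
        // Emit only the value; the shared NullWritable instance is a throwaway key.
        context.write(NullWritable.get(), line);
    }
}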

From source file:com.ebay.erl.mobius.core.mapred.ConfigurableJob.java

License:Apache License

private static void writePartitionFile(JobConf job, Sampler sampler) {
    try {
        ////////////////////////////////////////////////
        // first, getting samples from the data sources
        ////////////////////////////////////////////////
        LOGGER.info("Running local sampling for job [" + job.getJobName() + "]");
        InputFormat inf = job.getInputFormat();
        Object[] samples = sampler.getSample(inf, job);
        LOGGER.info("Samples retrieved, sorting...");

        ////////////////////////////////////////////////
        // sort the samples
        ////////////////////////////////////////////////
        RawComparator comparator = job.getOutputKeyComparator();
        Arrays.sort(samples, comparator);

        if (job.getBoolean("mobius.print.sample", false)) {
            PrintWriter pw = new PrintWriter(
                    new OutputStreamWriter(new GZIPOutputStream(new BufferedOutputStream(new FileOutputStream(
                            new File(job.get("mobius.sample.file", "./samples.txt.gz")))))));
            for (Object obj : samples) {
                pw.println(obj);
            }
            pw.flush();
            pw.close();
        }

        ////////////////////////////////////////////////
        // start to write partition files
        ////////////////////////////////////////////////

        FileSystem fs = FileSystem.get(job);
        Path partitionFile = fs.makeQualified(new Path(TotalOrderPartitioner.getPartitionFile(job)));
        while (fs.exists(partitionFile)) {
            partitionFile = new Path(partitionFile.toString() + "." + System.currentTimeMillis());
        }
        fs.deleteOnExit(partitionFile);
        TotalOrderPartitioner.setPartitionFile(job, partitionFile);
        LOGGER.info("write partition file to:" + partitionFile.toString());

        int reducersNbr = job.getNumReduceTasks();
        Set<Object> wroteSamples = new HashSet<Object>();

        SequenceFile.Writer writer = SequenceFile.createWriter(fs, job, partitionFile, Tuple.class,
                NullWritable.class);

        float avgReduceSize = samples.length / (float) reducersNbr;

        int lastBegin = 0;
        for (int i = 0; i < samples.length;) {
            // Trying to distribute the load evenly across reducers by
            // dividing <code>samples</code> into a set of blocks separated
            // by boundaries (objects selected from the <code>samples</code>
            // array); each block should have about the same size.

            // find the last index of the element equal to samples[i], as
            // that element might appear multiple times in the samples.
            int upperBound = Util.findUpperBound(samples, samples[i], comparator);

            int lowerBound = i;//Util.findLowerBound(samples, samples[i], comparator);

            // the repeat count of samples[i]; if this key's block is too
            // big, select the key itself as a boundary.
            int currentElemSize = upperBound - lowerBound + 1;

            if (currentElemSize > avgReduceSize * 2) // more than twice the average reducer size
            {
                // the current element repeats too often (more than twice
                // <code>avgReduceSize</code>), so use the element itself
                // as a boundary.
                writer.append(((DataJoinKey) samples[i]).getKey(), NullWritable.get());
                wroteSamples.add(((DataJoinKey) samples[i]).getKey());
                //pw.println(samples[i]);

                // immediately make the next element (starting at
                // <code>upperBound + 1</code>) a boundary as well, to
                // prevent the current one from consuming even more.
                if (upperBound + 1 < samples.length) {
                    writer.append(((DataJoinKey) samples[upperBound + 1]).getKey(), NullWritable.get());
                    wroteSamples.add(((DataJoinKey) samples[upperBound + 1]).getKey());
                    //pw.println(samples[upperBound+1]);

                    // move past all copies of <code>samples[upperBound + 1]</code>
                    lastBegin = Util.findUpperBound(samples, samples[upperBound + 1], comparator) + 1;
                    i = lastBegin;
                } else {
                    break;
                }
            } else {
                // the current element is small enough to be considered
                // part of the previous group
                int size = upperBound - lastBegin;
                if (size > avgReduceSize) {
                    // by including the current element, we have found a
                    // block that's big enough; select it as a boundary
                    writer.append(((DataJoinKey) samples[i]).getKey(), NullWritable.get());
                    wroteSamples.add(((DataJoinKey) samples[i]).getKey());
                    //pw.println(samples[i]);

                    i = upperBound + 1;
                    lastBegin = i;
                } else {
                    i = upperBound + 1;
                }
            }
        }

        writer.close();

        // if the number of written samples doesn't equal the number of
        // reducers minus one, the key space is too small and
        // TotalOrderPartitioner won't work, since it requires the
        // partition boundaries to be distinct.
        //
        // in that case we need to change the number of reducers
        if (wroteSamples.size() + 1 != reducersNbr) {
            LOGGER.info("Write complete, but key space is too small, sample size=" + wroteSamples.size()
                    + ", reducer size:" + (reducersNbr));
            LOGGER.info("Set the reducer size to:" + (wroteSamples.size() + 1));

            // add 1 because the written samples define the boundaries; e.g.,
            // if there are two samples [300, 1000], then there should be
            // 3 reducers: one handling i < 300, one handling 300 <= i < 1000,
            // and another handling 1000 <= i
            job.setNumReduceTasks((wroteSamples.size() + 1));
        }

        samples = null;
    } catch (IOException e) {
        LOGGER.error(e.getMessage(), e);
        throw new RuntimeException(e);
    }
}
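
For comparison, stock Hadoop ships the same idea: InputSampler.writePartitionFile() also stores sampled cut-point keys in a SequenceFile with NullWritable.get() as every value. A minimal sketch of wiring it up with TotalOrderPartitioner (the partition file path and sampler parameters below are placeholders):

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.partition.InputSampler;
import org.apache.hadoop.mapreduce.lib.partition.TotalOrderPartitioner;

public class TotalOrderSetup {
    public static void configure(Job job) throws Exception {
        job.setPartitionerClass(TotalOrderPartitioner.class);
        TotalOrderPartitioner.setPartitionFile(job.getConfiguration(),
                new Path("/tmp/partitions")); // placeholder path

        // Sample 10% of records (up to 10000 samples from at most 10 splits);
        // writePartitionFile() appends each cut-point key together with
        // NullWritable.get() as its value, just like the Mobius code above.
        InputSampler.Sampler<Text, Text> sampler =
                new InputSampler.RandomSampler<Text, Text>(0.1, 10000, 10);
        InputSampler.writePartitionFile(job, sampler);
    }
}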

From source file:com.ebay.erl.mobius.core.mapred.DefaultMobiusReducer.java

License:Apache License

private void output(Tuple aTuple, OutputCollector<NullWritable, WritableComparable<?>> output,
        Reporter reporter) throws IOException {
    aTuple.setToStringOrdering(this.outputColumnNames);
    if (this._persistantCriteria != null) {
        if (this._persistantCriteria.accept(aTuple, this.conf)) {
            output.collect(NullWritable.get(), aTuple);
            reporter.getCounter("Join/Grouping Records", "EMITTED").increment(1);
        } else {
            reporter.getCounter("Join/Grouping Records", "FILTERED").increment(1);
        }
    } else {
        output.collect(NullWritable.get(), aTuple);
        reporter.getCounter("Join/Grouping Records", "EMITTED").increment(1);
    }
}

From source file:com.ebay.erl.mobius.core.mapred.TotalSortReducer.java

License:Apache License

/**
 * reduce()
 * <p>
 * 
 * Output key is {@link org.apache.hadoop.io.NullWritable} and output 
 * value is {@link com.ebay.erl.mobius.core.model.Tuple}
 */
@Override
public void joinreduce(Tuple key, DataJoinValueGroup<Tuple> values,
        OutputCollector<NullWritable, WritableComparable<?>> output, Reporter reporter) throws IOException {
    if (values.hasNext()) {
        Byte datasetID = values.nextDatasetID();
        Iterator<Tuple> valuesToBeOutput = values.next();
        while (valuesToBeOutput.hasNext()) {
            Tuple outTuple = new Tuple();

            Tuple aTuple = valuesToBeOutput.next();
            aTuple.setSchema(this.getSchemaByDatasetID(datasetID));

            // make the column output ordering the same as
            // the <code>_projections</code> ordering.
            for (Projectable aFunc : this._projections) {
                String name = ((Column) aFunc).getInputColumnName();
                outTuple.insert(name, aTuple.get(name));
            }
            output.collect(NullWritable.get(), outTuple);
        }
    }
}

From source file:com.ebay.erl.mobius.core.model.ReadFieldImpl.java

License:Apache License

@Override
protected Void on_null_writable() throws IOException {
    this.values.add(NullWritable.get());
    return null;
}