List of usage examples for org.apache.hadoop.io.NullWritable.get()
public static NullWritable get()
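NullWritable.get() returns the shared singleton instance of NullWritable. It is the conventional placeholder when one side of a MapReduce key/value pair carries no data, which is how most of the examples below use it. A minimal sketch of that pattern (the LineKeyMapper class name is illustrative, not taken from the source files below):

import java.io.IOException;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

// Emits each input line as the key; the value carries no data,
// so the NullWritable singleton is written instead of a new object.
public class LineKeyMapper extends Mapper<LongWritable, Text, Text, NullWritable> {
    @Override
    protected void map(LongWritable offset, Text line, Context context)
            throws IOException, InterruptedException {
        context.write(line, NullWritable.get());
    }
}

Because NullWritable serializes to zero bytes, reusing the singleton avoids allocating a throwaway writable for every record.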
From source file:com.datasalt.pangool.tuplemr.mapred.TestTupleMRJob.java
License:Apache License
@Test
public void testJobWithNulls()
        throws IOException, TupleMRException, ClassNotFoundException, InterruptedException {
    Configuration conf = getConf();
    String input1 = TestTupleMRJob.class.getCanonicalName() + "-input1";
    String input2 = TestTupleMRJob.class.getCanonicalName() + "-input2";
    String output = TestTupleMRJob.class.getCanonicalName() + "-output";

    final Schema schemaNoNulls = new Schema("NoNulls", Fields.parse("f1:int,f2:string"));
    final Schema schemaNulls = new Schema("Nulls", Fields.parse("f1:int?,f2:string?"));
    Tuple t1 = new Tuple(schemaNoNulls);
    Tuple t2 = new Tuple(schemaNulls);

    t1.set(0, 0);
    t1.set(1, "nn");
    withTupleInput(input1, t1);

    Object tuples[][] = new Object[][] { new Object[] { 0, null }, new Object[] { 0, "n1" },
            new Object[] { null, "n2" } };
    for (Object[] tuple : tuples) {
        t2.set(0, tuple[0]);
        t2.set(1, tuple[1]);
        withTupleInput(input2, t2);
    }

    TupleMRBuilder builder = new TupleMRBuilder(getConf(), "test");
    builder.addTupleInput(new Path(input1), new IdentityTupleMapper());
    builder.addTupleInput(new Path(input2), new IdentityTupleMapper());
    builder.setTupleReducer(new TupleReducer<ITuple, NullWritable>() {
        @Override
        public void reduce(ITuple group, Iterable<ITuple> tuples, TupleMRContext context, Collector collector)
                throws IOException, InterruptedException, TupleMRException {
            int count = 0;
            for (ITuple tuple : tuples) {
                Tuple t = new Tuple(schemaNulls);
                t.set(0, tuple.get(0));
                t.set(1, tuple.get(1));
                collector.write(t, NullWritable.get());
                count++;
            }
            if (group.get(0) == null) {
                assertEquals(1, count);
            } else if (((Integer) group.get(0)) == 0) {
                assertEquals(3, count);
            }
        }
    });
    builder.addIntermediateSchema(schemaNoNulls);
    builder.addIntermediateSchema(schemaNulls);
    builder.setGroupByFields("f1");
    builder.setOrderBy(OrderBy.parse("f1:desc|null_smallest").addSchemaOrder(Criteria.Order.ASC));
    builder.setSpecificOrderBy("NoNulls", OrderBy.parse("f2:asc|null_biggest"));
    builder.setSpecificOrderBy("Nulls", OrderBy.parse("f2:asc|null_biggest"));
    builder.setTupleOutput(new Path(output), schemaNulls);

    Job job = builder.createJob();
    job.setNumReduceTasks(1);
    try {
        assertRun(job);
    } finally {
        builder.cleanUpInstanceFiles();
    }

    final Object expectedOutput[][] = new Object[][] { new Object[] { 0, "nn" }, new Object[] { 0, "n1" },
            new Object[] { 0, null }, new Object[] { null, "n2" } };

    boolean debug = false;
    if (debug) {
        readTuples(new Path(output + "/part-r-00000"), getConf(), new TupleVisitor() {
            @Override
            public void onTuple(ITuple t) {
                System.out.println(t);
            }
        });
    }

    readTuples(new Path(output + "/part-r-00000"), getConf(), new TupleVisitor() {
        int i = 0;

        @Override
        public void onTuple(ITuple t) {
            assertEqualsNull(expectedOutput[i][0], t.get(0));
            Object f2 = t.get(1);
            f2 = (f2 != null) ? f2.toString() : f2;
            assertEqualsNull(expectedOutput[i][1], f2);
            i++;
        }
    });

    trash(input1);
    trash(input2);
    trash(output);
}
From source file:com.datasalt.pangool.utils.test.AbstractHadoopTestLibrary.java
License:Apache License
public AbstractHadoopTestLibrary withInput(String input, Object key) throws IOException {
    return withInput(input, key, NullWritable.get());
}
From source file:com.datasalt.pangool.utils.test.AbstractHadoopTestLibrary.java
License:Apache License
public void withOutput(String output, Object key)
        throws IOException, ClassNotFoundException, InstantiationException, IllegalAccessException {
    withOutput(output, key, NullWritable.get());
}
From source file:com.ebay.erl.mobius.core.datajoin.DataJoinKey.java
License:Apache License
private WritableComparable getKey(byte type, DataInputBuffer input) throws IOException {
    if (type == Tuple.NULL_WRITABLE_TYPE)
        return NullWritable.get();
    else if (type == Tuple.TUPLE_TYPE) {
        Tuple newTuple = new Tuple();
        newTuple.readFields(input);
        return newTuple;
    } else {
        WritableComparable w = (WritableComparable) ReflectionUtils
                .newInstance(Util.getClass(input.readUTF()), conf);
        w.readFields(input);
        return w;
    }
}
From source file:com.ebay.erl.mobius.core.datajoin.EvenlyPartitioner.java
License:Apache License
/**
 * Read the cut points from the given IFile.
 *
 * @param fs The file system
 * @param p The path to read
 * @param keyClass The map output key class
 * @param job The job config
 * @throws IOException
 */
// matching key types enforced by passing in
@SuppressWarnings("unchecked") // map output key class
private K[] readPartitions(FileSystem fs, Path p, Class<K> keyClass, JobConf job) throws IOException {
    SequenceFile.Reader reader = new SequenceFile.Reader(fs, p, job);
    ArrayList<K> parts = new ArrayList<K>();
    K key = (K) ReflectionUtils.newInstance(keyClass, job);
    NullWritable value = NullWritable.get();
    while (reader.next(key, value)) {
        parts.add(key);
        key = (K) ReflectionUtils.newInstance(keyClass, job);
    }
    reader.close();
    return parts.toArray((K[]) Array.newInstance(keyClass, parts.size()));
}
From source file:com.ebay.erl.mobius.core.mapred.AbstractMobiusMapper.java
License:Apache License
protected void outputRecords(Tuple key, Tuple value,
        OutputCollector<WritableComparable<?>, WritableComparable<?>> output) throws IOException {
    if (this._IS_MAP_ONLY_JOB) {
        // map only job, key is not needed as no join is required.
        output.collect(NullWritable.get(), value);
    } else {
        if (key == null) {
            // should never happen, this is to perform join/group by, but there
            // is no key
            throw new IllegalArgumentException("key for dataset: " + this.getDatasetID()
                    + " cannot be empty when performing join/group by.");
        }
        output.collect(key, value);
    }
}
From source file:com.ebay.erl.mobius.core.mapred.ConfigurableJob.java
License:Apache License
private static void writePartitionFile(JobConf job, Sampler sampler) {
    try {
        ////////////////////////////////////////////////
        // first, getting samples from the data sources
        ////////////////////////////////////////////////
        LOGGER.info("Running local sampling for job [" + job.getJobName() + "]");
        InputFormat inf = job.getInputFormat();
        Object[] samples = sampler.getSample(inf, job);
        LOGGER.info("Samples retrieved, sorting...");

        ////////////////////////////////////////////////
        // sort the samples
        ////////////////////////////////////////////////
        RawComparator comparator = job.getOutputKeyComparator();
        Arrays.sort(samples, comparator);

        if (job.getBoolean("mobius.print.sample", false)) {
            PrintWriter pw = new PrintWriter(new OutputStreamWriter(new GZIPOutputStream(
                    new BufferedOutputStream(new FileOutputStream(
                            new File(job.get("mobius.sample.file", "./samples.txt.gz")))))));
            for (Object obj : samples) {
                pw.println(obj);
            }
            pw.flush();
            pw.close();
        }

        ////////////////////////////////////////////////
        // start to write partition files
        ////////////////////////////////////////////////
        FileSystem fs = FileSystem.get(job);
        Path partitionFile = fs.makeQualified(new Path(TotalOrderPartitioner.getPartitionFile(job)));
        while (fs.exists(partitionFile)) {
            partitionFile = new Path(partitionFile.toString() + "." + System.currentTimeMillis());
        }
        fs.deleteOnExit(partitionFile);
        TotalOrderPartitioner.setPartitionFile(job, partitionFile);
        LOGGER.info("write partition file to:" + partitionFile.toString());

        int reducersNbr = job.getNumReduceTasks();
        Set<Object> wroteSamples = new HashSet<Object>();

        SequenceFile.Writer writer = SequenceFile.createWriter(fs, job, partitionFile, Tuple.class,
                NullWritable.class);

        float avgReduceSize = samples.length / reducersNbr;

        int lastBegin = 0;
        for (int i = 0; i < samples.length;) {
            // trying to distribute the load for every reducer evenly,
            // dividing the <code>samples</code> into a set of blocks
            // separated by boundaries, objects that selected from the
            // <code>samples</code> array, and each blocks should have
            // about the same size.

            // find the last index of element that equals to samples[i], as
            // such element might appear multiple times in the samples.
            int upperBound = Util.findUpperBound(samples, samples[i], comparator);

            int lowerBound = i; //Util.findLowerBound(samples, samples[i], comparator);

            // the repeat time of samples[i], if the key itself is too big
            // select it as boundary
            int currentElemSize = upperBound - lowerBound + 1;

            if (currentElemSize > avgReduceSize * 2) // greater than two times of average reducer size
            {
                // the current element is too big, greater than
                // two times of the <code>avgReduceSize</code>,
                // put itself as boundary
                writer.append(((DataJoinKey) samples[i]).getKey(), NullWritable.get());
                wroteSamples.add(((DataJoinKey) samples[i]).getKey());
                //pw.println(samples[i]);

                // immediate put the next element to the boundary,
                // the next element starts at <code>upperBound+1</code>,
                // to prevent the current one consume even more.
                if (upperBound + 1 < samples.length) {
                    writer.append(((DataJoinKey) samples[upperBound + 1]).getKey(), NullWritable.get());
                    wroteSamples.add(((DataJoinKey) samples[upperBound + 1]).getKey());
                    //pw.println(samples[upperBound+1]);

                    // move on to the next element of <code>samples[upperBound+1]</code>
                    lastBegin = Util.findUpperBound(samples, samples[upperBound + 1], comparator) + 1;
                    i = lastBegin;
                } else {
                    break;
                }
            } else {
                // current element is small enough to be consider
                // with previous group
                int size = upperBound - lastBegin;
                if (size > avgReduceSize) {
                    // by including the current elements, we have
                    // found a block that's big enough, select it
                    // as boundary
                    writer.append(((DataJoinKey) samples[i]).getKey(), NullWritable.get());
                    wroteSamples.add(((DataJoinKey) samples[i]).getKey());
                    //pw.println(samples[i]);

                    i = upperBound + 1;
                    lastBegin = i;
                } else {
                    i = upperBound + 1;
                }
            }
        }

        writer.close();

        // if the number of wrote samples doesn't equals to number of
        // reducer minus one, then it means the key spaces is too small
        // hence TotalOrderPartitioner won't work, it works only if
        // the partition boundaries are distinct.
        //
        // we need to change the number of reducers
        if (wroteSamples.size() + 1 != reducersNbr) {
            LOGGER.info("Write complete, but key space is too small, sample size=" + wroteSamples.size()
                    + ", reducer size:" + (reducersNbr));
            LOGGER.info("Set the reducer size to:" + (wroteSamples.size() + 1));

            // add 1 because the wrote samples define boundary, ex, if
            // the sample size is two with two element [300, 1000], then
            // there should be 3 reducers, one for handling i<300, one
            // for 300<=i<1000, and another one for 1000<=i
            job.setNumReduceTasks((wroteSamples.size() + 1));
        }

        samples = null;
    } catch (IOException e) {
        LOGGER.error(e.getMessage(), e);
        throw new RuntimeException(e);
    }
}
From source file:com.ebay.erl.mobius.core.mapred.DefaultMobiusReducer.java
License:Apache License
private void output(Tuple aTuple, OutputCollector<NullWritable, WritableComparable<?>> output,
        Reporter reporter) throws IOException {
    aTuple.setToStringOrdering(this.outputColumnNames);

    if (this._persistantCriteria != null) {
        if (this._persistantCriteria.accept(aTuple, this.conf)) {
            output.collect(NullWritable.get(), aTuple);
            reporter.getCounter("Join/Grouping Records", "EMITTED").increment(1);
        } else {
            reporter.getCounter("Join/Grouping Records", "FILTERED").increment(1);
        }
    } else {
        output.collect(NullWritable.get(), aTuple);
        reporter.getCounter("Join/Grouping Records", "EMITTED").increment(1);
    }
}
From source file:com.ebay.erl.mobius.core.mapred.TotalSortReducer.java
License:Apache License
/**
 * reduce()
 * <p>
 *
 * Output key is {@link org.apache.hadoop.io.NullWritable} and output
 * value is {@link com.ebay.erl.mobius.core.model.Tuple}
 */
@Override
public void joinreduce(Tuple key, DataJoinValueGroup<Tuple> values,
        OutputCollector<NullWritable, WritableComparable<?>> output, Reporter reporter) throws IOException {
    if (values.hasNext()) {
        Byte datasetID = values.nextDatasetID();
        Iterator<Tuple> valuesToBeOutput = values.next();
        while (valuesToBeOutput.hasNext()) {
            Tuple outTuple = new Tuple();
            Tuple aTuple = valuesToBeOutput.next();
            aTuple.setSchema(this.getSchemaByDatasetID(datasetID));

            // make the column output ordering the same as
            // the <code>_projections</code> ordering.
            for (Projectable aFunc : this._projections) {
                String name = ((Column) aFunc).getInputColumnName();
                outTuple.insert(name, aTuple.get(name));
            }
            output.collect(NullWritable.get(), outTuple);
        }
    }
}
From source file:com.ebay.erl.mobius.core.model.ReadFieldImpl.java
License:Apache License
@Override
protected Void on_null_writable() throws IOException {
    this.values.add(NullWritable.get());
    return null;
}