List of usage examples for org.apache.hadoop.mapred.OutputCollector
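Every example below implements the same single-method contract: OutputCollector<K, V> declares void collect(K key, V value) throws IOException, and the caller supplies whatever behavior should happen per emitted pair. As a point of reference before the examples, here is a minimal sketch of an in-memory implementation of the kind that is handy in unit tests; the ListOutputCollector name and its backing list are illustrative and not part of Hadoop:

import java.io.IOException;
import java.util.AbstractMap.SimpleEntry;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;

import org.apache.hadoop.mapred.OutputCollector;

// Illustrative helper: buffers collected pairs in memory so a test can
// assert on what a Mapper or Reducer emitted. Not part of Hadoop itself.
public class ListOutputCollector<K, V> implements OutputCollector<K, V> {
    private final List<Map.Entry<K, V>> collected = new ArrayList<>();

    @Override
    public void collect(K key, V value) throws IOException {
        collected.add(new SimpleEntry<>(key, value));
    }

    public List<Map.Entry<K, V>> getCollected() {
        return collected;
    }
}

Everything that follows is a variation on this theme: the anonymous classes below write to RecordWriters, fan out to several outputs, count pairs, or simply discard them.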
From source file: cascading.flow.tez.stream.element.TezBoundaryStage.java
License: Open Source License
protected OutputCollector createOutputCollector() {
    if (logicalOutputs.size() == 1)
        return new OldOutputCollector(Util.getFirst(logicalOutputs));

    final OutputCollector[] collectors = new OutputCollector[logicalOutputs.size()];

    int count = 0;
    for (LogicalOutput logicalOutput : logicalOutputs)
        collectors[count++] = new OldOutputCollector(logicalOutput);

    return new OutputCollector() {
        @Override
        public void collect(Object key, Object value) throws IOException {
            for (OutputCollector outputCollector : collectors)
                outputCollector.collect(key, value);
        }
    };
}
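The same fan-out idea, isolated from the Tez plumbing and with the generics the raw-typed original drops: one collector broadcasts each pair to all of its delegates. A minimal sketch; the BroadcastOutputCollector name is ours, not Cascading's:

import java.io.IOException;
import java.util.List;

import org.apache.hadoop.mapred.OutputCollector;

// Illustrative composite collector: forwards every pair to all delegates.
class BroadcastOutputCollector<K, V> implements OutputCollector<K, V> {
    private final List<OutputCollector<K, V>> delegates;

    BroadcastOutputCollector(List<OutputCollector<K, V>> delegates) {
        this.delegates = delegates;
    }

    @Override
    public void collect(K key, V value) throws IOException {
        for (OutputCollector<K, V> delegate : delegates)
            delegate.collect(key, value);
    }
}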
From source file: com.scaleoutsoftware.soss.hserver.hadoop.MapperWrapperMapred.java
License: Apache License
/**
 * Runs the mapper for a single split.
 *
 * @param mapOutputAccumulator mapOutputAccumulator to use
 * @param split                split to run on
 */
@Override
@SuppressWarnings("unchecked")
public void runSplit(final MapOutputAccumulator<OUTKEY, OUTVALUE> mapOutputAccumulator, Object split,
        int splitIndex) throws IOException, ClassNotFoundException, InterruptedException {
    JobConf jobConf = new JobConf(this.jobConf); // Clone JobConf to prevent unexpected task interaction

    TaskAttemptID taskAttemptID = TaskAttemptID
            .downgrade(hadoopVersionSpecificCode.createTaskAttemptId(jobId, true, splitIndex));

    ReducerWrapperMapred.updateJobConf(jobConf, taskAttemptID, splitIndex);
    updateJobWithSplit(jobConf, split);

    InputFormat inputFormat = jobConf.getInputFormat();
    Reporter reporter = Reporter.NULL;

    // Create the RecordReader
    org.apache.hadoop.mapred.RecordReader<INKEY, INVALUE> recordReader = inputFormat
            .getRecordReader((InputSplit) split, jobConf, reporter);

    // Make a mapper
    org.apache.hadoop.mapred.Mapper<INKEY, INVALUE, OUTKEY, OUTVALUE> mapper;
    try {
        mapper = (org.apache.hadoop.mapred.Mapper<INKEY, INVALUE, OUTKEY, OUTVALUE>) mapperConstructor
                .newInstance();
        mapper.configure(jobConf);
    } catch (Exception e) {
        throw new RuntimeException("Cannot instantiate mapper " + mapperConstructor.getDeclaringClass(), e);
    }

    // These are to support map-only jobs which write output directly to HDFS.
    final RecordWriter outputRecordWriter;
    OutputCommitter outputCommitter = null;
    TaskAttemptContext taskAttemptContext = null;

    if (mapOnlyJob) {
        taskAttemptContext = hadoopVersionSpecificCode.createTaskAttemptContextMapred(jobConf, taskAttemptID);
        OutputFormat outputFormat = jobConf.getOutputFormat();
        FileSystem fs = FileSystem.get(jobConf);
        outputRecordWriter = (org.apache.hadoop.mapred.RecordWriter<OUTKEY, OUTVALUE>) outputFormat
                .getRecordWriter(fs, jobConf, ReducerWrapperMapred.getOutputName(splitIndex), Reporter.NULL);
        outputCommitter = jobConf.getOutputCommitter();

        // Create a task object so it can handle file format initialization.
        // MapTask is private in Hadoop 1.x, so we have to go through reflection.
        try {
            Class mapTask = Class.forName("org.apache.hadoop.mapred.MapTask");
            Constructor mapTaskConstructor = mapTask.getDeclaredConstructor(String.class, TaskAttemptID.class,
                    int.class, JobSplit.TaskSplitIndex.class, int.class);
            mapTaskConstructor.setAccessible(true);
            Task task = (Task) mapTaskConstructor.newInstance(null, taskAttemptID, splitIndex,
                    new JobSplit.TaskSplitIndex(), 0);
            task.setConf(jobConf);
            task.initialize(jobConf, jobId, Reporter.NULL, false);
        } catch (Exception e) {
            throw new IOException("Cannot initialize MapTask", e);
        }
        outputCommitter.setupTask(taskAttemptContext);
    } else {
        outputRecordWriter = null;
    }

    OutputCollector<OUTKEY, OUTVALUE> outputCollector;

    if (!mapOnlyJob) {
        // Feed map output into the combiner/accumulator.
        outputCollector = new OutputCollector<OUTKEY, OUTVALUE>() {
            @Override
            public void collect(OUTKEY outkey, OUTVALUE outvalue) throws IOException {
                try {
                    mapOutputAccumulator.combine(outkey, outvalue);
                } catch (InterruptedException e) {
                    Thread.currentThread().interrupt();
                }
            }
        };
    } else {
        // Map-only job: write map output straight to the RecordWriter.
        outputCollector = new OutputCollector<OUTKEY, OUTVALUE>() {
            @Override
            public void collect(OUTKEY outkey, OUTVALUE outvalue) throws IOException {
                outputRecordWriter.write(outkey, outvalue);
            }
        };
    }

    INKEY key = recordReader.createKey();
    INVALUE value = recordReader.createValue();

    while (recordReader.next(key, value)) {
        mapper.map(key, value, outputCollector, reporter);
    }

    mapper.close();
    recordReader.close();

    if (mapOnlyJob) {
        outputRecordWriter.close(Reporter.NULL);
        outputCommitter.commitTask(taskAttemptContext);
    }
}
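Stripped of the wrapper machinery, the heart of runSplit is the standard old-API drive loop: the holders returned by createKey()/createValue() are populated in place by next() and handed, together with the chosen collector, to the mapper. A condensed sketch of just that loop, as a self-contained helper (the driveMapper name is ours):

import java.io.IOException;

import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.RecordReader;
import org.apache.hadoop.mapred.Reporter;

// Sketch of the drive loop, assuming the reader, mapper, collector and
// reporter are set up as in runSplit above.
class MapDriver {
    static <K1, V1, K2, V2> void driveMapper(RecordReader<K1, V1> recordReader,
            Mapper<K1, V1, K2, V2> mapper, OutputCollector<K2, V2> outputCollector,
            Reporter reporter) throws IOException {
        K1 key = recordReader.createKey();     // single reusable key holder
        V1 value = recordReader.createValue(); // single reusable value holder
        while (recordReader.next(key, value)) {
            // next() refills the same two objects on every iteration, so the
            // mapper must not hold references to them across calls.
            mapper.map(key, value, outputCollector, reporter);
        }
        mapper.close();
        recordReader.close();
    }
}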
From source file: com.scaleoutsoftware.soss.hserver.hadoop.ReducerWrapperMapred.java
License: Apache License
public ReducerWrapperMapred(HServerInvocationParameters invocationParameters, int hadoopPartition, int appId,
        int region, boolean sort) throws IOException, ClassNotFoundException, InterruptedException {
    this.invocationParameters = invocationParameters;
    JobConf jobConf = new JobConf((Configuration) invocationParameters.getConfiguration()); // Clone JobConf, so the temporary settings do not pollute other tasks

    LOG.info("Starting reducer:" + HadoopInvocationParameters.dumpConfiguration(jobConf));

    JobID jobID = (JobID) invocationParameters.getJobId();
    this.hadoopPartition = hadoopPartition;
    hadoopVersionSpecificCode = HadoopVersionSpecificCode.getInstance(invocationParameters.getHadoopVersion(),
            jobConf);

    TaskAttemptID taskAttemptID = TaskAttemptID
            .downgrade(hadoopVersionSpecificCode.createTaskAttemptId(jobID, false, hadoopPartition));

    updateJobConf(jobConf, taskAttemptID, region);

    context = hadoopVersionSpecificCode.createTaskAttemptContextMapred(jobConf, taskAttemptID);

    reducer = (org.apache.hadoop.mapred.Reducer<INKEY, INVALUE, OUTKEY, OUTVALUE>) ReflectionUtils
            .newInstance(jobConf.getReducerClass(), jobConf);
    reducer.configure(jobConf);

    OutputFormat outputFormat = jobConf.getOutputFormat();
    FileSystem fs = FileSystem.get(jobConf);
    recordWriter = (org.apache.hadoop.mapred.RecordWriter<OUTKEY, OUTVALUE>) outputFormat.getRecordWriter(fs,
            jobConf, getOutputName(hadoopPartition), Reporter.NULL);
    committer = jobConf.getOutputCommitter();

    // Create a task object so it can handle file format initialization.
    // ReduceTask is private in Hadoop 1.x, so we have to go through reflection.
    try {
        Class reduceTask = Class.forName("org.apache.hadoop.mapred.ReduceTask");
        Constructor reduceTaskConstructor = reduceTask.getDeclaredConstructor(String.class, TaskAttemptID.class,
                int.class, int.class, int.class);
        reduceTaskConstructor.setAccessible(true);
        Task task = (Task) reduceTaskConstructor.newInstance(null, taskAttemptID, hadoopPartition, 0, 0);
        task.setConf(jobConf);
        task.initialize(jobConf, jobID, Reporter.NULL, false);
    } catch (Exception e) {
        throw new IOException("Cannot initialize ReduceTask", e);
    }
    committer.setupTask(context);

    Class<INKEY> keyClass = (Class<INKEY>) jobConf.getMapOutputKeyClass();
    WritableSerializerDeserializer<INKEY> firstKeySerializer = new WritableSerializerDeserializer<INKEY>(
            keyClass, null);
    WritableSerializerDeserializer<INKEY> secondKeySerializer = new WritableSerializerDeserializer<INKEY>(
            keyClass, null);
    Class<INVALUE> valueClass = (Class<INVALUE>) jobConf.getMapOutputValueClass();
    WritableSerializerDeserializer<INVALUE> valueSerializer = new WritableSerializerDeserializer<INVALUE>(
            valueClass, null);

    DataGridReaderParameters<INKEY, INVALUE> params = new DataGridReaderParameters<INKEY, INVALUE>(region,
            appId, HServerParameters.getSetting(REDUCE_USEMEMORYMAPPEDFILES, jobConf) > 0, firstKeySerializer,
            valueSerializer, invocationParameters.getSerializationMode(), secondKeySerializer, keyClass,
            valueClass, sort, HServerParameters.getSetting(REDUCE_CHUNKSTOREADAHEAD, jobConf),
            1024 * HServerParameters.getSetting(REDUCE_INPUTCHUNKSIZE_KB, jobConf),
            HServerParameters.getSetting(REDUCE_CHUNKREADTIMEOUT, jobConf));
    transport = DataGridChunkedCollectionReader.getGridReader(params);

    outputCollector = new OutputCollector<OUTKEY, OUTVALUE>() {
        @Override
        public void collect(OUTKEY outkey, OUTVALUE outvalue) throws IOException {
            recordWriter.write(outkey, outvalue);
        }
    };
}
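The anonymous collector at the end of the constructor is an adapter from RecordWriter to OutputCollector, a pairing that recurs in several of these examples. The same adapter as a small named class, for reference; RecordWriterCollector is an illustrative name, not part of Hadoop:

import java.io.IOException;

import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.RecordWriter;

// Illustrative adapter: exposes a RecordWriter through the OutputCollector
// interface so map/reduce code can write to it directly.
class RecordWriterCollector<K, V> implements OutputCollector<K, V> {
    private final RecordWriter<K, V> writer;

    RecordWriterCollector(RecordWriter<K, V> writer) {
        this.writer = writer;
    }

    @Override
    public void collect(K key, V value) throws IOException {
        writer.write(key, value);
    }
}

Note the adapter deliberately leaves close(Reporter) to the owner of the RecordWriter: OutputCollector has no lifecycle methods, which is why the wrappers in these examples close the writer separately.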
From source file: com.scaleoutsoftware.soss.hserver.hadoop.ReducerWrapperMapred.java
License: Apache License
public ReducerWrapperMapred(HServerInvocationParameters invocationParameters,
        final MapOutputAccumulator<OUTKEY, OUTVALUE> consumer,
        Class<? extends org.apache.hadoop.mapred.Reducer> combinerClass)
        throws IOException, ClassNotFoundException, InterruptedException {
    JobConf jobConf = (JobConf) invocationParameters.getConfiguration();

    reducer = (org.apache.hadoop.mapred.Reducer<INKEY, INVALUE, OUTKEY, OUTVALUE>) ReflectionUtils
            .newInstance(combinerClass, jobConf);

    outputCollector = new OutputCollector<OUTKEY, OUTVALUE>() {
        @Override
        public void collect(OUTKEY outkey, OUTVALUE outvalue) throws IOException {
            try {
                consumer.saveCombineResult(outkey, outvalue);
            } catch (Exception e) {
                throw new IOException("Error while saving combined result.", e);
            }
        }
    };
}
From source file: edu.stolaf.cs.wmrserver.streaming.PipeMapRed.java
License: Apache License
void waitOutputThreads() {
    try {
        if (outThread_ == null) {
            // This happens only when the reducer has empty input (so reduce() is
            // never called in this task). If the reducer still generates output,
            // which is very uncommon, we do not write it to HDFS; we consume and
            // discard it just to keep the reducer from hanging forever.
            OutputCollector collector = new OutputCollector() {
                public void collect(Object key, Object value) throws IOException {
                    // just consume it, no need to write the record anywhere
                }
            };
            Reporter reporter = Reporter.NULL; // dummy reporter
            startOutputThreads(collector, reporter);
        }
        int exitVal = sim.waitFor();
        // how'd it go?
        if (exitVal != 0) {
            if (nonZeroExitIsFailure_) {
                throw new RuntimeException(
                        "PipeMapRed.waitOutputThreads(): subprocess failed with code " + exitVal);
            } else {
                logprintln("PipeMapRed.waitOutputThreads(): subprocess exited with code " + exitVal + " in "
                        + PipeMapRed.class.getName());
            }
        }
        if (outThread_ != null) {
            outThread_.join(joinDelay_);
        }
        if (errThread_ != null) {
            errThread_.join(joinDelay_);
        }
        if (outerrThreadsThrowable != null) {
            throw new RuntimeException(outerrThreadsThrowable);
        }
    } catch (InterruptedException e) {
        // ignore
    }
}
From source file: edu.umn.cs.spatialHadoop.core.RectangleNN.java
License: Open Source License
public static <S1 extends Shape, S2 extends Shape> int SpatialJoin_planeSweepFilterOnly(final List<S1> R,
        final List<S2> S, final ResultCollector2<S1, S2> output, Reporter reporter) throws IOException {
    LOG.debug("Start spatial join plane sweep algorithm!");

    final RectangleID[] Rmbrs = new RectangleID[R.size()];
    for (int i = 0; i < R.size(); i++) {
        Rmbrs[i] = new RectangleID(i, R.get(i).getMBR());
    }
    final RectangleID[] Smbrs = new RectangleID[S.size()];
    for (int i = 0; i < S.size(); i++) {
        Smbrs[i] = new RectangleID(i, S.get(i).getMBR());
    }

    final IntWritable count = new IntWritable();
    int filterCount = SpatialJoin_rectangles(Rmbrs, Smbrs, new OutputCollector<RectangleID, RectangleID>() {
        @Override
        public void collect(RectangleID r1, RectangleID r2) throws IOException {
            // Filter-only join: the refinement test is deliberately disabled.
            //if (R.get(r1.id).isIntersected(S.get(r2.id))) {
            if (output != null)
                output.collect(R.get(r1.id), S.get(r2.id));
            count.set(count.get() + 1);
            //}
        }
    }, reporter);

    LOG.debug("Filtered result size " + filterCount + ", refined result size " + count.get());

    return count.get();
}
From source file: edu.umn.cs.spatialHadoop.core.RectangleNN.java
License: Open Source License
/**
 * The general version of the self-join algorithm, which works with arbitrary
 * shapes. First, it performs a filter step where it finds shapes with
 * overlapping MBRs. Second, an optional refine step can be executed to
 * return only shapes which actually overlap.
 * @param R input set of shapes
 * @param refine whether or not to run a refine step
 * @param output output collector where the results are reported
 * @return number of pairs returned by the plane-sweep algorithm
 * @throws IOException
 */
public static <S extends Shape> int SelfJoin_planeSweep(final S[] R, boolean refine,
        final OutputCollector<S, S> output, Progressable reporter) throws IOException {
    // Use a two-phase filter and refine approach
    // 1- Use MBRs as a first filter
    // 2- Use ConvexHull as a second filter
    // 3- Use the exact shape for refinement
    final RectangleID[] mbrs = new RectangleID[R.length];
    for (int i = 0; i < R.length; i++) {
        mbrs[i] = new RectangleID(i, R[i].getMBR());
    }

    if (refine) {
        final IntWritable count = new IntWritable();
        int filterCount = SelfJoin_rectangles(mbrs, new OutputCollector<RectangleID, RectangleID>() {
            @Override
            public void collect(RectangleID r1, RectangleID r2) throws IOException {
                if (R[r1.id].isIntersected(R[r2.id])) {
                    if (output != null)
                        output.collect(R[r1.id], R[r2.id]);
                    count.set(count.get() + 1);
                }
            }
        }, reporter);
        LOG.debug("Filtered result size " + filterCount + ", refined result size " + count.get());
        return count.get();
    } else {
        return SelfJoin_rectangles(mbrs, new OutputCollector<RectangleID, RectangleID>() {
            @Override
            public void collect(RectangleID r1, RectangleID r2) throws IOException {
                if (output != null)
                    output.collect(R[r1.id], R[r2.id]);
            }
        }, reporter);
    }
}
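Both spatial-join methods above count matches with a final IntWritable captured by the anonymous collector: Java does not let an anonymous class reassign a captured local int, so a mutable box is used instead. The idiom in isolation, as a reusable wrapper; CountingCollector is our name, not SpatialHadoop's:

import java.io.IOException;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.mapred.OutputCollector;

// Illustrative counting wrapper: forwards pairs to an optional delegate
// and tallies how many were emitted. A final IntWritable (any mutable box
// works) stands in for a local int, which an anonymous class could not
// reassign.
class CountingCollector<K, V> implements OutputCollector<K, V> {
    private final OutputCollector<K, V> delegate; // may be null
    private final IntWritable count = new IntWritable(0);

    CountingCollector(OutputCollector<K, V> delegate) {
        this.delegate = delegate;
    }

    @Override
    public void collect(K key, V value) throws IOException {
        if (delegate != null)
            delegate.collect(key, value);
        count.set(count.get() + 1);
    }

    public int getCount() {
        return count.get();
    }
}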
From source file: edu.umn.cs.spatialHadoop.core.RectangleNN.java
License: Open Source License
/**
 * Groups polygons by overlap.
 * @param polygons
 * @param prog
 * @return
 * @throws IOException
 */
public static Geometry[][] groupPolygons(final Geometry[] polygons, final Progressable prog)
        throws IOException {
    // Group shapes into overlapping groups
    long t1 = System.currentTimeMillis();
    RectangleID[] mbrs = new RectangleID[polygons.length];
    for (int i = 0; i < polygons.length; i++) {
        Coordinate[] coords = polygons[i].getEnvelope().getCoordinates();
        double x1 = Math.min(coords[0].x, coords[2].x);
        double x2 = Math.max(coords[0].x, coords[2].x);
        double y1 = Math.min(coords[0].y, coords[2].y);
        double y2 = Math.max(coords[0].y, coords[2].y);
        mbrs[i] = new RectangleID(i, x1, y1, x2, y2);
    }

    // Parent link of the union-find data structure
    final int[] parent = new int[mbrs.length];
    Arrays.fill(parent, -1);

    // Group records into clusters by overlap
    SelfJoin_rectangles(mbrs, new OutputCollector<RectangleID, RectangleID>() {
        @Override
        public void collect(RectangleID r, RectangleID s) throws IOException {
            // Find the root of each record, halving the path along the way,
            // then link the two roots.
            int rid = r.id;
            while (parent[rid] != -1) {
                int pid = parent[rid];
                if (parent[pid] != -1)
                    parent[rid] = parent[pid];
                rid = pid;
            }
            int sid = s.id;
            while (parent[sid] != -1) {
                int pid = parent[sid];
                if (parent[pid] != -1)
                    parent[sid] = parent[pid];
                sid = pid;
            }
            if (rid != sid)
                parent[rid] = sid;
        }
    }, prog);
    mbrs = null;

    // Collect all records of each cluster into one list
    Map<Integer, List<Geometry>> groups = new HashMap<Integer, List<Geometry>>();
    for (int i = 0; i < parent.length; i++) {
        int root = parent[i];
        if (root == -1)
            root = i;
        while (parent[root] != -1) {
            root = parent[root];
        }
        List<Geometry> group = groups.get(root);
        if (group == null) {
            group = new Vector<Geometry>();
            groups.put(root, group);
        }
        group.add(polygons[i]);
    }
    long t2 = System.currentTimeMillis();

    Geometry[][] groupedPolygons = new Geometry[groups.size()][];
    int counter = 0;
    for (List<Geometry> group : groups.values()) {
        groupedPolygons[counter++] = group.toArray(new Geometry[group.size()]);
    }
    LOG.debug("Grouped " + parent.length + " shapes into " + groups.size() + " clusters in " + (t2 - t1) / 1000.0
            + " seconds");
    return groupedPolygons;
}
From source file: edu.umn.cs.spatialHadoop.operations.DistributedJoin.java
License: Open Source License
private static long selfJoinLocal(Path in, Path out, OperationsParams params) throws IOException {
    if (isOneShotReadMode) {
        // Ensure all objects are read in one shot
        params.setInt(SpatialSite.MaxBytesInOneRead, -1);
        params.setInt(SpatialSite.MaxShapesInOneRead, -1);
    } else {
        params.setInt(SpatialSite.MaxBytesInOneRead, maxBytesInOneRead);
        params.setInt(SpatialSite.MaxShapesInOneRead, maxShapesInOneRead);
    }

    ShapeArrayInputFormat inputFormat = new ShapeArrayInputFormat();
    JobConf job = new JobConf(params);
    FileInputFormat.addInputPath(job, in);
    InputSplit[] splits = inputFormat.getSplits(job, 1);

    FileSystem outFs = out.getFileSystem(params);
    final PrintStream writer = new PrintStream(outFs.create(out));

    // Process all input files
    long resultSize = 0;
    for (InputSplit split : splits) {
        ShapeArrayRecordReader reader = new ShapeArrayRecordReader(job, (FileSplit) split);
        final Text temp = new Text();

        Rectangle key = reader.createKey();
        ArrayWritable value = reader.createValue();
        if (reader.next(key, value)) {
            Shape[] writables = (Shape[]) value.get();
            resultSize += SpatialAlgorithms.SelfJoin_planeSweep(writables, true,
                    new OutputCollector<Shape, Shape>() {
                        @Override
                        public void collect(Shape r, Shape s) throws IOException {
                            writer.print(r.toText(temp));
                            writer.print(",");
                            writer.println(s.toText(temp));
                        }
                    }, null);
            if (reader.next(key, value)) {
                throw new RuntimeException("Error! Not all values read in one shot.");
            }
        }
        reader.close();
    }
    writer.close();

    return resultSize;
}
From source file: it.crs4.pydoop.pipes.PipesReducer.java
License: Apache License
/**
 * Handle the end of the input by closing down the application.
 */
public void close() throws IOException {
    // if we haven't started the application, we have nothing to do
    if (isOk) {
        OutputCollector<K3, V3> nullCollector = new OutputCollector<K3, V3>() {
            public void collect(K3 key, V3 value) throws IOException {
                // NULL
            }
        };
        startApplication(nullCollector, Reporter.NULL);
    }
    try {
        if (isOk) {
            application.getDownlink().endOfInput();
        } else {
            // send the abort to the application and let it clean up
            application.getDownlink().abort();
        }
        LOG.info("waiting for finish");
        application.waitForFinish();
        LOG.info("got done");
    } catch (Throwable t) {
        application.abort(t);
    } finally {
        application.cleanup();
    }
}