List of usage examples for org.apache.hadoop.mapred.OutputCollector
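Every example below implements the same single-method contract: OutputCollector<K, V> declares void collect(K key, V value) throws IOException, and the caller supplies whatever behavior should happen per emitted pair. As a point of reference before the examples, here is a minimal sketch of an in-memory implementation of the kind that is handy in unit tests; the ListOutputCollector name and its backing list are illustrative and not part of Hadoop:

import java.io.IOException;
import java.util.AbstractMap.SimpleEntry;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;

import org.apache.hadoop.mapred.OutputCollector;

// Illustrative helper: buffers collected pairs in memory so a test can
// assert on what a Mapper or Reducer emitted. Not part of Hadoop itself.
public class ListOutputCollector<K, V> implements OutputCollector<K, V> {
    private final List<Map.Entry<K, V>> collected = new ArrayList<>();

    @Override
    public void collect(K key, V value) throws IOException {
        collected.add(new SimpleEntry<>(key, value));
    }

    public List<Map.Entry<K, V>> getCollected() {
        return collected;
    }
}

Everything that follows is a variation on this theme: the anonymous classes below write to RecordWriters, fan out to several outputs, count pairs, or simply discard them.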
From source file: cascading.flow.tez.stream.element.TezBoundaryStage.java
License: Open Source License
protected OutputCollector createOutputCollector() {
    if (logicalOutputs.size() == 1)
        return new OldOutputCollector(Util.getFirst(logicalOutputs));

    final OutputCollector[] collectors = new OutputCollector[logicalOutputs.size()];

    int count = 0;
    for (LogicalOutput logicalOutput : logicalOutputs)
        collectors[count++] = new OldOutputCollector(logicalOutput);

    return new OutputCollector() {
        @Override
        public void collect(Object key, Object value) throws IOException {
            for (OutputCollector outputCollector : collectors)
                outputCollector.collect(key, value);
        }
    };
}
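The same fan-out idea, isolated from the Tez plumbing and with the generics the raw-typed original drops: one collector broadcasts each pair to all of its delegates. A minimal sketch; the BroadcastOutputCollector name is ours, not Cascading's:

import java.io.IOException;
import java.util.List;

import org.apache.hadoop.mapred.OutputCollector;

// Illustrative composite collector: forwards every pair to all delegates.
class BroadcastOutputCollector<K, V> implements OutputCollector<K, V> {
    private final List<OutputCollector<K, V>> delegates;

    BroadcastOutputCollector(List<OutputCollector<K, V>> delegates) {
        this.delegates = delegates;
    }

    @Override
    public void collect(K key, V value) throws IOException {
        for (OutputCollector<K, V> delegate : delegates)
            delegate.collect(key, value);
    }
}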
From source file: com.scaleoutsoftware.soss.hserver.hadoop.MapperWrapperMapred.java
License: Apache License
/**
 * Runs the mapper for a single split.
 *
 * @param mapOutputAccumulator mapOutputAccumulator to use
 * @param split                split to run on
 */
@Override
@SuppressWarnings("unchecked")
public void runSplit(final MapOutputAccumulator<OUTKEY, OUTVALUE> mapOutputAccumulator, Object split,
        int splitIndex) throws IOException, ClassNotFoundException, InterruptedException {
    JobConf jobConf = new JobConf(this.jobConf); // Clone JobConf to prevent unexpected task interaction

    TaskAttemptID taskAttemptID = TaskAttemptID
            .downgrade(hadoopVersionSpecificCode.createTaskAttemptId(jobId, true, splitIndex));

    ReducerWrapperMapred.updateJobConf(jobConf, taskAttemptID, splitIndex);
    updateJobWithSplit(jobConf, split);

    InputFormat inputFormat = jobConf.getInputFormat();
    Reporter reporter = Reporter.NULL;

    // Create the RecordReader
    org.apache.hadoop.mapred.RecordReader<INKEY, INVALUE> recordReader = inputFormat
            .getRecordReader((InputSplit) split, jobConf, reporter);

    // Make a mapper
    org.apache.hadoop.mapred.Mapper<INKEY, INVALUE, OUTKEY, OUTVALUE> mapper;
    try {
        mapper = (org.apache.hadoop.mapred.Mapper<INKEY, INVALUE, OUTKEY, OUTVALUE>) mapperConstructor
                .newInstance();
        mapper.configure(jobConf);
    } catch (Exception e) {
        throw new RuntimeException("Cannot instantiate mapper " + mapperConstructor.getDeclaringClass(), e);
    }

    // These are to support map-only jobs which write output directly to HDFS.
    final RecordWriter outputRecordWriter;
    OutputCommitter outputCommitter = null;
    TaskAttemptContext taskAttemptContext = null;

    if (mapOnlyJob) {
        taskAttemptContext = hadoopVersionSpecificCode.createTaskAttemptContextMapred(jobConf, taskAttemptID);
        OutputFormat outputFormat = jobConf.getOutputFormat();
        FileSystem fs = FileSystem.get(jobConf);
        outputRecordWriter = (org.apache.hadoop.mapred.RecordWriter<OUTKEY, OUTVALUE>) outputFormat
                .getRecordWriter(fs, jobConf, ReducerWrapperMapred.getOutputName(splitIndex), Reporter.NULL);
        outputCommitter = jobConf.getOutputCommitter();

        // Create a task object so it can handle file format initialization.
        // MapTask is private in Hadoop 1.x, so we have to go through reflection.
        try {
            Class mapTask = Class.forName("org.apache.hadoop.mapred.MapTask");
            Constructor mapTaskConstructor = mapTask.getDeclaredConstructor(String.class, TaskAttemptID.class,
                    int.class, JobSplit.TaskSplitIndex.class, int.class);
            mapTaskConstructor.setAccessible(true);
            Task task = (Task) mapTaskConstructor.newInstance(null, taskAttemptID, splitIndex,
                    new JobSplit.TaskSplitIndex(), 0);
            task.setConf(jobConf);
            task.initialize(jobConf, jobId, Reporter.NULL, false);
        } catch (Exception e) {
            throw new IOException("Cannot initialize MapTask", e);
        }
        outputCommitter.setupTask(taskAttemptContext);
    } else {
        outputRecordWriter = null;
    }

    OutputCollector<OUTKEY, OUTVALUE> outputCollector;

    if (!mapOnlyJob) {
        // Feed map output into the combiner/accumulator.
        outputCollector = new OutputCollector<OUTKEY, OUTVALUE>() {
            @Override
            public void collect(OUTKEY outkey, OUTVALUE outvalue) throws IOException {
                try {
                    mapOutputAccumulator.combine(outkey, outvalue);
                } catch (InterruptedException e) {
                    Thread.currentThread().interrupt();
                }
            }
        };
    } else {
        // Map-only job: write map output straight to the RecordWriter.
        outputCollector = new OutputCollector<OUTKEY, OUTVALUE>() {
            @Override
            public void collect(OUTKEY outkey, OUTVALUE outvalue) throws IOException {
                outputRecordWriter.write(outkey, outvalue);
            }
        };
    }

    INKEY key = recordReader.createKey();
    INVALUE value = recordReader.createValue();

    while (recordReader.next(key, value)) {
        mapper.map(key, value, outputCollector, reporter);
    }

    mapper.close();
    recordReader.close();

    if (mapOnlyJob) {
        outputRecordWriter.close(Reporter.NULL);
        outputCommitter.commitTask(taskAttemptContext);
    }
}
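Stripped of the wrapper machinery, the heart of runSplit is the standard old-API drive loop: the holders returned by createKey()/createValue() are populated in place by next() and handed, together with the chosen collector, to the mapper. A condensed sketch of just that loop, as a self-contained helper (the driveMapper name is ours):

import java.io.IOException;

import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.RecordReader;
import org.apache.hadoop.mapred.Reporter;

// Sketch of the drive loop, assuming the reader, mapper, collector and
// reporter are set up as in runSplit above.
class MapDriver {
    static <K1, V1, K2, V2> void driveMapper(RecordReader<K1, V1> recordReader,
            Mapper<K1, V1, K2, V2> mapper, OutputCollector<K2, V2> outputCollector,
            Reporter reporter) throws IOException {
        K1 key = recordReader.createKey();     // single reusable key holder
        V1 value = recordReader.createValue(); // single reusable value holder
        while (recordReader.next(key, value)) {
            // next() refills the same two objects on every iteration, so the
            // mapper must not hold references to them across calls.
            mapper.map(key, value, outputCollector, reporter);
        }
        mapper.close();
        recordReader.close();
    }
}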
From source file: com.scaleoutsoftware.soss.hserver.hadoop.ReducerWrapperMapred.java
License: Apache License
public ReducerWrapperMapred(HServerInvocationParameters invocationParameters, int hadoopPartition, int appId,
        int region, boolean sort) throws IOException, ClassNotFoundException, InterruptedException {
    this.invocationParameters = invocationParameters;
    JobConf jobConf = new JobConf((Configuration) invocationParameters.getConfiguration()); // Clone JobConf, so the temporary settings do not pollute other tasks

    LOG.info("Starting reducer:" + HadoopInvocationParameters.dumpConfiguration(jobConf));

    JobID jobID = (JobID) invocationParameters.getJobId();
    this.hadoopPartition = hadoopPartition;
    hadoopVersionSpecificCode = HadoopVersionSpecificCode.getInstance(invocationParameters.getHadoopVersion(),
            jobConf);

    TaskAttemptID taskAttemptID = TaskAttemptID
            .downgrade(hadoopVersionSpecificCode.createTaskAttemptId(jobID, false, hadoopPartition));

    updateJobConf(jobConf, taskAttemptID, region);

    context = hadoopVersionSpecificCode.createTaskAttemptContextMapred(jobConf, taskAttemptID);

    reducer = (org.apache.hadoop.mapred.Reducer<INKEY, INVALUE, OUTKEY, OUTVALUE>) ReflectionUtils
            .newInstance(jobConf.getReducerClass(), jobConf);
    reducer.configure(jobConf);

    OutputFormat outputFormat = jobConf.getOutputFormat();
    FileSystem fs = FileSystem.get(jobConf);
    recordWriter = (org.apache.hadoop.mapred.RecordWriter<OUTKEY, OUTVALUE>) outputFormat.getRecordWriter(fs,
            jobConf, getOutputName(hadoopPartition), Reporter.NULL);
    committer = jobConf.getOutputCommitter();

    // Create a task object so it can handle file format initialization.
    // ReduceTask is private in Hadoop 1.x, so we have to go through reflection.
    try {
        Class reduceTask = Class.forName("org.apache.hadoop.mapred.ReduceTask");
        Constructor reduceTaskConstructor = reduceTask.getDeclaredConstructor(String.class, TaskAttemptID.class,
                int.class, int.class, int.class);
        reduceTaskConstructor.setAccessible(true);
        Task task = (Task) reduceTaskConstructor.newInstance(null, taskAttemptID, hadoopPartition, 0, 0);
        task.setConf(jobConf);
        task.initialize(jobConf, jobID, Reporter.NULL, false);
    } catch (Exception e) {
        throw new IOException("Cannot initialize ReduceTask", e);
    }
    committer.setupTask(context);

    Class<INKEY> keyClass = (Class<INKEY>) jobConf.getMapOutputKeyClass();
    WritableSerializerDeserializer<INKEY> firstKeySerializer = new WritableSerializerDeserializer<INKEY>(
            keyClass, null);
    WritableSerializerDeserializer<INKEY> secondKeySerializer = new WritableSerializerDeserializer<INKEY>(
            keyClass, null);
    Class<INVALUE> valueClass = (Class<INVALUE>) jobConf.getMapOutputValueClass();
    WritableSerializerDeserializer<INVALUE> valueSerializer = new WritableSerializerDeserializer<INVALUE>(
            valueClass, null);

    DataGridReaderParameters<INKEY, INVALUE> params = new DataGridReaderParameters<INKEY, INVALUE>(region,
            appId, HServerParameters.getSetting(REDUCE_USEMEMORYMAPPEDFILES, jobConf) > 0, firstKeySerializer,
            valueSerializer, invocationParameters.getSerializationMode(), secondKeySerializer, keyClass,
            valueClass, sort, HServerParameters.getSetting(REDUCE_CHUNKSTOREADAHEAD, jobConf),
            1024 * HServerParameters.getSetting(REDUCE_INPUTCHUNKSIZE_KB, jobConf),
            HServerParameters.getSetting(REDUCE_CHUNKREADTIMEOUT, jobConf));
    transport = DataGridChunkedCollectionReader.getGridReader(params);

    outputCollector = new OutputCollector<OUTKEY, OUTVALUE>() {
        @Override
        public void collect(OUTKEY outkey, OUTVALUE outvalue) throws IOException {
            recordWriter.write(outkey, outvalue);
        }
    };
}
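The anonymous collector at the end of the constructor is an adapter from RecordWriter to OutputCollector, a pairing that recurs in several of these examples. The same adapter as a small named class, for reference; RecordWriterCollector is an illustrative name, not part of Hadoop:

import java.io.IOException;

import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.RecordWriter;

// Illustrative adapter: exposes a RecordWriter through the OutputCollector
// interface so map/reduce code can write to it directly.
class RecordWriterCollector<K, V> implements OutputCollector<K, V> {
    private final RecordWriter<K, V> writer;

    RecordWriterCollector(RecordWriter<K, V> writer) {
        this.writer = writer;
    }

    @Override
    public void collect(K key, V value) throws IOException {
        writer.write(key, value);
    }
}

Note the adapter deliberately leaves close(Reporter) to the owner of the RecordWriter: OutputCollector has no lifecycle methods, which is why the wrappers in these examples close the writer separately.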
From source file: com.scaleoutsoftware.soss.hserver.hadoop.ReducerWrapperMapred.java
License: Apache License
public ReducerWrapperMapred(HServerInvocationParameters invocationParameters,
        final MapOutputAccumulator<OUTKEY, OUTVALUE> consumer,
        Class<? extends org.apache.hadoop.mapred.Reducer> combinerClass)
        throws IOException, ClassNotFoundException, InterruptedException {
    JobConf jobConf = (JobConf) invocationParameters.getConfiguration();

    reducer = (org.apache.hadoop.mapred.Reducer<INKEY, INVALUE, OUTKEY, OUTVALUE>) ReflectionUtils
            .newInstance(combinerClass, jobConf);

    outputCollector = new OutputCollector<OUTKEY, OUTVALUE>() {
        @Override
        public void collect(OUTKEY outkey, OUTVALUE outvalue) throws IOException {
            try {
                consumer.saveCombineResult(outkey, outvalue);
            } catch (Exception e) {
                throw new IOException("Error while saving combined result.", e);
            }
        }
    };
}
From source file: edu.stolaf.cs.wmrserver.streaming.PipeMapRed.java
License: Apache License
void waitOutputThreads() {
    try {
        if (outThread_ == null) {
            // This happens only when the reducer has empty input (so reduce() is
            // never called in this task). If the reducer still generates output,
            // which is very uncommon, we do not write it to HDFS; we consume and
            // discard it just to keep the reducer from hanging forever.
            OutputCollector collector = new OutputCollector() {
                public void collect(Object key, Object value) throws IOException {
                    // just consume it, no need to write the record anywhere
                }
            };
            Reporter reporter = Reporter.NULL; // dummy reporter
            startOutputThreads(collector, reporter);
        }
        int exitVal = sim.waitFor();
        // how'd it go?
        if (exitVal != 0) {
            if (nonZeroExitIsFailure_) {
                throw new RuntimeException(
                        "PipeMapRed.waitOutputThreads(): subprocess failed with code " + exitVal);
            } else {
                logprintln("PipeMapRed.waitOutputThreads(): subprocess exited with code " + exitVal + " in "
                        + PipeMapRed.class.getName());
            }
        }
        if (outThread_ != null) {
            outThread_.join(joinDelay_);
        }
        if (errThread_ != null) {
            errThread_.join(joinDelay_);
        }
        if (outerrThreadsThrowable != null) {
            throw new RuntimeException(outerrThreadsThrowable);
        }
    } catch (InterruptedException e) {
        // ignore
    }
}
From source file: edu.umn.cs.spatialHadoop.core.RectangleNN.java
License: Open Source License
public static <S1 extends Shape, S2 extends Shape> int SpatialJoin_planeSweepFilterOnly(final List<S1> R,
        final List<S2> S, final ResultCollector2<S1, S2> output, Reporter reporter) throws IOException {
    LOG.debug("Start spatial join plane sweep algorithm!");

    final RectangleID[] Rmbrs = new RectangleID[R.size()];
    for (int i = 0; i < R.size(); i++) {
        Rmbrs[i] = new RectangleID(i, R.get(i).getMBR());
    }
    final RectangleID[] Smbrs = new RectangleID[S.size()];
    for (int i = 0; i < S.size(); i++) {
        Smbrs[i] = new RectangleID(i, S.get(i).getMBR());
    }

    final IntWritable count = new IntWritable();
    int filterCount = SpatialJoin_rectangles(Rmbrs, Smbrs, new OutputCollector<RectangleID, RectangleID>() {
        @Override
        public void collect(RectangleID r1, RectangleID r2) throws IOException {
            // Filter-only join: the refinement test is deliberately disabled.
            //if (R.get(r1.id).isIntersected(S.get(r2.id))) {
            if (output != null)
                output.collect(R.get(r1.id), S.get(r2.id));
            count.set(count.get() + 1);
            //}
        }
    }, reporter);

    LOG.debug("Filtered result size " + filterCount + ", refined result size " + count.get());

    return count.get();
}
From source file: edu.umn.cs.spatialHadoop.core.RectangleNN.java
License: Open Source License
/**
 * The general version of the self-join algorithm, which works with arbitrary
 * shapes. First, it performs a filter step where it finds shapes with
 * overlapping MBRs. Second, an optional refine step can be executed to
 * return only shapes which actually overlap.
 * @param R input set of shapes
 * @param refine whether or not to run a refine step
 * @param output output collector where the results are reported
 * @return number of pairs returned by the plane-sweep algorithm
 * @throws IOException
 */
public static <S extends Shape> int SelfJoin_planeSweep(final S[] R, boolean refine,
        final OutputCollector<S, S> output, Progressable reporter) throws IOException {
    // Use a two-phase filter and refine approach
    // 1- Use MBRs as a first filter
    // 2- Use ConvexHull as a second filter
    // 3- Use the exact shape for refinement
    final RectangleID[] mbrs = new RectangleID[R.length];
    for (int i = 0; i < R.length; i++) {
        mbrs[i] = new RectangleID(i, R[i].getMBR());
    }

    if (refine) {
        final IntWritable count = new IntWritable();
        int filterCount = SelfJoin_rectangles(mbrs, new OutputCollector<RectangleID, RectangleID>() {
            @Override
            public void collect(RectangleID r1, RectangleID r2) throws IOException {
                if (R[r1.id].isIntersected(R[r2.id])) {
                    if (output != null)
                        output.collect(R[r1.id], R[r2.id]);
                    count.set(count.get() + 1);
                }
            }
        }, reporter);
        LOG.debug("Filtered result size " + filterCount + ", refined result size " + count.get());
        return count.get();
    } else {
        return SelfJoin_rectangles(mbrs, new OutputCollector<RectangleID, RectangleID>() {
            @Override
            public void collect(RectangleID r1, RectangleID r2) throws IOException {
                if (output != null)
                    output.collect(R[r1.id], R[r2.id]);
            }
        }, reporter);
    }
}
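Both spatial-join methods above count matches with a final IntWritable captured by the anonymous collector: Java does not let an anonymous class reassign a captured local int, so a mutable box is used instead. The idiom in isolation, as a reusable wrapper; CountingCollector is our name, not SpatialHadoop's:

import java.io.IOException;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.mapred.OutputCollector;

// Illustrative counting wrapper: forwards pairs to an optional delegate
// and tallies how many were emitted. A final IntWritable (any mutable box
// works) stands in for a local int, which an anonymous class could not
// reassign.
class CountingCollector<K, V> implements OutputCollector<K, V> {
    private final OutputCollector<K, V> delegate; // may be null
    private final IntWritable count = new IntWritable(0);

    CountingCollector(OutputCollector<K, V> delegate) {
        this.delegate = delegate;
    }

    @Override
    public void collect(K key, V value) throws IOException {
        if (delegate != null)
            delegate.collect(key, value);
        count.set(count.get() + 1);
    }

    public int getCount() {
        return count.get();
    }
}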
From source file: edu.umn.cs.spatialHadoop.core.RectangleNN.java
License: Open Source License
/**
 * Groups polygons by overlap.
 * @param polygons
 * @param prog
 * @return
 * @throws IOException
 */
public static Geometry[][] groupPolygons(final Geometry[] polygons, final Progressable prog)
        throws IOException {
    // Group shapes into overlapping groups
    long t1 = System.currentTimeMillis();
    RectangleID[] mbrs = new RectangleID[polygons.length];
    for (int i = 0; i < polygons.length; i++) {
        Coordinate[] coords = polygons[i].getEnvelope().getCoordinates();
        double x1 = Math.min(coords[0].x, coords[2].x);
        double x2 = Math.max(coords[0].x, coords[2].x);
        double y1 = Math.min(coords[0].y, coords[2].y);
        double y2 = Math.max(coords[0].y, coords[2].y);
        mbrs[i] = new RectangleID(i, x1, y1, x2, y2);
    }

    // Parent link of the union-find data structure
    final int[] parent = new int[mbrs.length];
    Arrays.fill(parent, -1);

    // Group records into clusters by overlap
    SelfJoin_rectangles(mbrs, new OutputCollector<RectangleID, RectangleID>() {
        @Override
        public void collect(RectangleID r, RectangleID s) throws IOException {
            // Find the root of each record, halving the path along the way,
            // then link the two roots.
            int rid = r.id;
            while (parent[rid] != -1) {
                int pid = parent[rid];
                if (parent[pid] != -1)
                    parent[rid] = parent[pid];
                rid = pid;
            }
            int sid = s.id;
            while (parent[sid] != -1) {
                int pid = parent[sid];
                if (parent[pid] != -1)
                    parent[sid] = parent[pid];
                sid = pid;
            }
            if (rid != sid)
                parent[rid] = sid;
        }
    }, prog);
    mbrs = null;

    // Collect all records of each cluster into one list
    Map<Integer, List<Geometry>> groups = new HashMap<Integer, List<Geometry>>();
    for (int i = 0; i < parent.length; i++) {
        int root = parent[i];
        if (root == -1)
            root = i;
        while (parent[root] != -1) {
            root = parent[root];
        }
        List<Geometry> group = groups.get(root);
        if (group == null) {
            group = new Vector<Geometry>();
            groups.put(root, group);
        }
        group.add(polygons[i]);
    }
    long t2 = System.currentTimeMillis();

    Geometry[][] groupedPolygons = new Geometry[groups.size()][];
    int counter = 0;
    for (List<Geometry> group : groups.values()) {
        groupedPolygons[counter++] = group.toArray(new Geometry[group.size()]);
    }
    LOG.debug("Grouped " + parent.length + " shapes into " + groups.size() + " clusters in " + (t2 - t1) / 1000.0
            + " seconds");
    return groupedPolygons;
}
From source file: edu.umn.cs.spatialHadoop.operations.DistributedJoin.java
License: Open Source License
private static long selfJoinLocal(Path in, Path out, OperationsParams params) throws IOException {
    if (isOneShotReadMode) {
        // Ensure all objects are read in one shot
        params.setInt(SpatialSite.MaxBytesInOneRead, -1);
        params.setInt(SpatialSite.MaxShapesInOneRead, -1);
    } else {
        params.setInt(SpatialSite.MaxBytesInOneRead, maxBytesInOneRead);
        params.setInt(SpatialSite.MaxShapesInOneRead, maxShapesInOneRead);
    }

    ShapeArrayInputFormat inputFormat = new ShapeArrayInputFormat();
    JobConf job = new JobConf(params);
    FileInputFormat.addInputPath(job, in);
    InputSplit[] splits = inputFormat.getSplits(job, 1);

    FileSystem outFs = out.getFileSystem(params);
    final PrintStream writer = new PrintStream(outFs.create(out));

    // Process all input files
    long resultSize = 0;
    for (InputSplit split : splits) {
        ShapeArrayRecordReader reader = new ShapeArrayRecordReader(job, (FileSplit) split);
        final Text temp = new Text();

        Rectangle key = reader.createKey();
        ArrayWritable value = reader.createValue();
        if (reader.next(key, value)) {
            Shape[] writables = (Shape[]) value.get();
            resultSize += SpatialAlgorithms.SelfJoin_planeSweep(writables, true,
                    new OutputCollector<Shape, Shape>() {
                        @Override
                        public void collect(Shape r, Shape s) throws IOException {
                            writer.print(r.toText(temp));
                            writer.print(",");
                            writer.println(s.toText(temp));
                        }
                    }, null);
            if (reader.next(key, value)) {
                throw new RuntimeException("Error! Not all values read in one shot.");
            }
        }
        reader.close();
    }
    writer.close();

    return resultSize;
}
From source file: it.crs4.pydoop.pipes.PipesReducer.java
License: Apache License
/**
 * Handle the end of the input by closing down the application.
 */
public void close() throws IOException {
    // if we haven't started the application, we have nothing to do
    if (isOk) {
        OutputCollector<K3, V3> nullCollector = new OutputCollector<K3, V3>() {
            public void collect(K3 key, V3 value) throws IOException {
                // NULL
            }
        };
        startApplication(nullCollector, Reporter.NULL);
    }
    try {
        if (isOk) {
            application.getDownlink().endOfInput();
        } else {
            // send the abort to the application and let it clean up
            application.getDownlink().abort();
        }
        LOG.info("waiting for finish");
        application.waitForFinish();
        LOG.info("got done");
    } catch (Throwable t) {
        application.abort(t);
    } finally {
        application.cleanup();
    }
}