Example usage for org.apache.hadoop.mapred.OutputCollector

Introduction

On this page you can find example usages of org.apache.hadoop.mapred.OutputCollector, the interface that classic (mapred) API mappers and reducers use to emit output key/value pairs.

Prototype

OutputCollector
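
For reference, the classic mapred API declares OutputCollector as a single-method generic interface; every example on this page supplies that one method, usually as an anonymous class. A minimal sketch of the declaration:

public interface OutputCollector<K, V> {
    /** Adds a key/value pair to the output. */
    void collect(K key, V value) throws IOException;
}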

Usage
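
Before the project-specific examples below, the following minimal word-count-style mapper (a sketch for illustration only, not taken from any of the listed projects) shows where an OutputCollector typically appears: the framework passes it into map(), and the mapper emits pairs through collect().

import java.io.IOException;
import java.util.StringTokenizer;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reporter;

// Hypothetical illustration class; the class and field names are not from any project on this page.
public class WordCountMapper extends MapReduceBase
        implements Mapper<LongWritable, Text, Text, IntWritable> {

    private static final IntWritable ONE = new IntWritable(1);
    private final Text word = new Text();

    @Override
    public void map(LongWritable key, Text value, OutputCollector<Text, IntWritable> output,
            Reporter reporter) throws IOException {
        StringTokenizer tokenizer = new StringTokenizer(value.toString());
        while (tokenizer.hasMoreTokens()) {
            word.set(tokenizer.nextToken());
            output.collect(word, ONE); // emit (word, 1) through the collector
        }
    }
}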

From source file: cascading.flow.tez.stream.element.TezBoundaryStage.java

License: Open Source License

protected OutputCollector createOutputCollector() {
    if (logicalOutputs.size() == 1)
        return new OldOutputCollector(Util.getFirst(logicalOutputs));

    final OutputCollector[] collectors = new OutputCollector[logicalOutputs.size()];

    int count = 0;
    for (LogicalOutput logicalOutput : logicalOutputs)
        collectors[count++] = new OldOutputCollector(logicalOutput);

    // Fan out each collected pair to every logical output
    return new OutputCollector() {
        @Override
        public void collect(Object key, Object value) throws IOException {
            for (OutputCollector outputCollector : collectors)
                outputCollector.collect(key, value);
        }
    };
}
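
The method above builds a composite collector that fans each collect() call out to every logical output. A minimal, self-contained sketch of the same pattern (the FanOutCollectorExample class and its stand-in collectors are hypothetical, not Cascading or Tez code):

import java.io.IOException;

import org.apache.hadoop.mapred.OutputCollector;

public class FanOutCollectorExample {

    // Returns a collector that forwards every pair to all of the given collectors.
    @SafeVarargs
    static <K, V> OutputCollector<K, V> fanOut(final OutputCollector<K, V>... targets) {
        return new OutputCollector<K, V>() {
            @Override
            public void collect(K key, V value) throws IOException {
                for (OutputCollector<K, V> target : targets)
                    target.collect(key, value);
            }
        };
    }

    public static void main(String[] args) throws IOException {
        // Two trivial stand-in collectors that just print what they receive.
        OutputCollector<String, Integer> first = new OutputCollector<String, Integer>() {
            @Override
            public void collect(String key, Integer value) {
                System.out.println("first: " + key + "=" + value);
            }
        };
        OutputCollector<String, Integer> second = new OutputCollector<String, Integer>() {
            @Override
            public void collect(String key, Integer value) {
                System.out.println("second: " + key + "=" + value);
            }
        };

        fanOut(first, second).collect("key", 1); // delivered to both collectors
    }
}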

From source file: com.scaleoutsoftware.soss.hserver.hadoop.MapperWrapperMapred.java

License: Apache License

/**
 * Runs the mapper for a single split.
 *
 * @param mapOutputAccumulator mapOutputAccumulator to use
 * @param split                split to run on
 */

@Override
@SuppressWarnings("unchecked")
public void runSplit(final MapOutputAccumulator<OUTKEY, OUTVALUE> mapOutputAccumulator, Object split,
        int splitIndex) throws IOException, ClassNotFoundException, InterruptedException {
    JobConf jobConf = new JobConf(this.jobConf); //Clone JobConf to prevent unexpected task interaction

    TaskAttemptID taskAttemptID = TaskAttemptID
            .downgrade(hadoopVersionSpecificCode.createTaskAttemptId(jobId, true, splitIndex));

    ReducerWrapperMapred.updateJobConf(jobConf, taskAttemptID, splitIndex);
    updateJobWithSplit(jobConf, split);

    InputFormat inputFormat = jobConf.getInputFormat();

    Reporter reporter = Reporter.NULL;

    //Create RecordReader
    org.apache.hadoop.mapred.RecordReader<INKEY, INVALUE> recordReader = inputFormat
            .getRecordReader((InputSplit) split, jobConf, reporter);

    //Make a mapper
    org.apache.hadoop.mapred.Mapper<INKEY, INVALUE, OUTKEY, OUTVALUE> mapper;
    try {
        mapper = (org.apache.hadoop.mapred.Mapper<INKEY, INVALUE, OUTKEY, OUTVALUE>) mapperConstructor
                .newInstance();
        mapper.configure(jobConf);
    } catch (Exception e) {
        throw new RuntimeException("Cannot instantiate mapper " + mapperConstructor.getDeclaringClass(), e);
    }

    //These are to support map only jobs which write output directly to HDFS.
    final RecordWriter outputRecordWriter;
    OutputCommitter outputCommitter = null;
    TaskAttemptContext taskAttemptContext = null;

    if (mapOnlyJob) {

        taskAttemptContext = hadoopVersionSpecificCode.createTaskAttemptContextMapred(jobConf, taskAttemptID);
        OutputFormat outputFormat = jobConf.getOutputFormat();
        FileSystem fs = FileSystem.get(jobConf);
        outputRecordWriter = (org.apache.hadoop.mapred.RecordWriter<OUTKEY, OUTVALUE>) outputFormat
                .getRecordWriter(fs, jobConf, ReducerWrapperMapred.getOutputName(splitIndex), Reporter.NULL);
        outputCommitter = jobConf.getOutputCommitter();

        //Create task object so it can handle file format initialization
        //The MapTask class is not public in Hadoop 1.x, so we have to go through reflection.
        try {
            Class reduceTask = Class.forName("org.apache.hadoop.mapred.MapTask");
            Constructor reduceTaskConstructor = reduceTask.getDeclaredConstructor(String.class,
                    TaskAttemptID.class, int.class, JobSplit.TaskSplitIndex.class, int.class);
            reduceTaskConstructor.setAccessible(true);
            Task task = (Task) reduceTaskConstructor.newInstance(null, taskAttemptID, splitIndex,
                    new JobSplit.TaskSplitIndex(), 0);
            task.setConf(jobConf);
            task.initialize(jobConf, jobId, Reporter.NULL, false);
        } catch (Exception e) {
            throw new IOException("Cannot initialize MapTask", e);
        }
        outputCommitter.setupTask(taskAttemptContext);
    } else {
        outputRecordWriter = null;
    }

    OutputCollector<OUTKEY, OUTVALUE> outputCollector;

    if (!mapOnlyJob) {
        outputCollector = new OutputCollector<OUTKEY, OUTVALUE>() {
            @Override
            public void collect(OUTKEY outkey, OUTVALUE outvalue) throws IOException {
                try {
                    mapOutputAccumulator.combine(outkey, outvalue);
                } catch (InterruptedException e) {
                    Thread.currentThread().interrupt();
                }
            }
        };
    } else {
        outputCollector = new OutputCollector<OUTKEY, OUTVALUE>() {
            @Override
            public void collect(OUTKEY outkey, OUTVALUE outvalue) throws IOException {
                outputRecordWriter.write(outkey, outvalue);
            }
        };
    }

    INKEY key = recordReader.createKey();
    INVALUE value = recordReader.createValue();

    while (recordReader.next(key, value)) {
        mapper.map(key, value, outputCollector, reporter);
    }
    mapper.close();

    recordReader.close();

    if (mapOnlyJob) {
        outputRecordWriter.close(Reporter.NULL);
        outputCommitter.commitTask(taskAttemptContext);
    }

}

From source file: com.scaleoutsoftware.soss.hserver.hadoop.ReducerWrapperMapred.java

License: Apache License

public ReducerWrapperMapred(HServerInvocationParameters invocationParameters, int hadoopPartition, int appId,
        int region, boolean sort) throws IOException, ClassNotFoundException, InterruptedException {
    this.invocationParameters = invocationParameters;
    JobConf jobConf = new JobConf((Configuration) invocationParameters.getConfiguration()); //Clone JobConf, so the temporary settings do not pollute other tasks

    LOG.info("Starting reducer:" + HadoopInvocationParameters.dumpConfiguration(jobConf));

    JobID jobID = (JobID) invocationParameters.getJobId();
    this.hadoopPartition = hadoopPartition;
    hadoopVersionSpecificCode = HadoopVersionSpecificCode.getInstance(invocationParameters.getHadoopVersion(),
            jobConf);

    TaskAttemptID taskAttemptID = TaskAttemptID
            .downgrade(hadoopVersionSpecificCode.createTaskAttemptId(jobID, false, hadoopPartition));

    updateJobConf(jobConf, taskAttemptID, region);

    context = hadoopVersionSpecificCode.createTaskAttemptContextMapred(jobConf, taskAttemptID);

    reducer = (org.apache.hadoop.mapred.Reducer<INKEY, INVALUE, OUTKEY, OUTVALUE>) ReflectionUtils
            .newInstance(jobConf.getReducerClass(), jobConf);

    reducer.configure(jobConf);

    OutputFormat outputFormat = jobConf.getOutputFormat();

    FileSystem fs = FileSystem.get(jobConf);
    recordWriter = (org.apache.hadoop.mapred.RecordWriter<OUTKEY, OUTVALUE>) outputFormat.getRecordWriter(fs,
            jobConf, getOutputName(hadoopPartition), Reporter.NULL);

    committer = jobConf.getOutputCommitter();

    //Create task object so it can handle file format initialization
    //The ReduceTask class is not public in Hadoop 1.x, so we have to go through reflection.
    try {
        Class reduceTask = Class.forName("org.apache.hadoop.mapred.ReduceTask");
        Constructor reduceTaskConstructor = reduceTask.getDeclaredConstructor(String.class, TaskAttemptID.class,
                int.class, int.class, int.class);
        reduceTaskConstructor.setAccessible(true);
        Task task = (Task) reduceTaskConstructor.newInstance(null, taskAttemptID, hadoopPartition, 0, 0);
        task.setConf(jobConf);
        task.initialize(jobConf, jobID, Reporter.NULL, false);
    } catch (Exception e) {
        throw new IOException("Cannot initialize ReduceTask", e);
    }

    committer.setupTask(context);

    Class<INKEY> keyClass = (Class<INKEY>) jobConf.getMapOutputKeyClass();
    WritableSerializerDeserializer<INKEY> firstKeySerializer = new WritableSerializerDeserializer<INKEY>(
            keyClass, null);
    WritableSerializerDeserializer<INKEY> secondKeySerializer = new WritableSerializerDeserializer<INKEY>(
            keyClass, null);
    Class<INVALUE> valueClass = (Class<INVALUE>) jobConf.getMapOutputValueClass();
    WritableSerializerDeserializer<INVALUE> valueSerializer = new WritableSerializerDeserializer<INVALUE>(
            valueClass, null);

    DataGridReaderParameters<INKEY, INVALUE> params = new DataGridReaderParameters<INKEY, INVALUE>(region,
            appId, HServerParameters.getSetting(REDUCE_USEMEMORYMAPPEDFILES, jobConf) > 0, firstKeySerializer,
            valueSerializer, invocationParameters.getSerializationMode(), secondKeySerializer, keyClass,
            valueClass, sort, HServerParameters.getSetting(REDUCE_CHUNKSTOREADAHEAD, jobConf),
            1024 * HServerParameters.getSetting(REDUCE_INPUTCHUNKSIZE_KB, jobConf),
            HServerParameters.getSetting(REDUCE_CHUNKREADTIMEOUT, jobConf));
    transport = DataGridChunkedCollectionReader.getGridReader(params);
    outputCollector = new OutputCollector<OUTKEY, OUTVALUE>() {
        @Override
        public void collect(OUTKEY outkey, OUTVALUE outvalue) throws IOException {
            recordWriter.write(outkey, outvalue);
        }
    };
}

From source file: com.scaleoutsoftware.soss.hserver.hadoop.ReducerWrapperMapred.java

License: Apache License

public ReducerWrapperMapred(HServerInvocationParameters invocationParameters,
        final MapOutputAccumulator<OUTKEY, OUTVALUE> consumer,
        Class<? extends org.apache.hadoop.mapred.Reducer> combinerClass)
        throws IOException, ClassNotFoundException, InterruptedException {
    JobConf jobConf = (JobConf) invocationParameters.getConfiguration();

    reducer = (org.apache.hadoop.mapred.Reducer<INKEY, INVALUE, OUTKEY, OUTVALUE>) ReflectionUtils
            .newInstance(combinerClass, jobConf);

    outputCollector = new OutputCollector<OUTKEY, OUTVALUE>() {
        @Override
        public void collect(OUTKEY outkey, OUTVALUE outvalue) throws IOException {
            try {
                consumer.saveCombineResult(outkey, outvalue);
            } catch (Exception e) {
                throw new IOException("Error while saving combined result.", e);
            }
        }
    };

}

From source file: edu.stolaf.cs.wmrserver.streaming.PipeMapRed.java

License: Apache License

void waitOutputThreads() {
    try {
        if (outThread_ == null) {
            // This happens only when the reducer has empty input (so reduce() is
            // never called in this task). The reducer may still generate output,
            // which is very uncommon and we may not have to support, but we
            // consume/collect that output (without writing it to HDFS) just to
            // keep the reducer from hanging forever.

            OutputCollector collector = new OutputCollector() {
                public void collect(Object key, Object value) throws IOException {
                    //just consume it, no need to write the record anywhere
                }
            };
            Reporter reporter = Reporter.NULL; // dummy reporter
            startOutputThreads(collector, reporter);
        }
        int exitVal = sim.waitFor();
        // how'd it go?
        if (exitVal != 0) {
            if (nonZeroExitIsFailure_) {
                throw new RuntimeException(
                        "PipeMapRed.waitOutputThreads(): subprocess failed with code " + exitVal);
            } else {
                logprintln("PipeMapRed.waitOutputThreads(): subprocess exited with code " + exitVal + " in "
                        + PipeMapRed.class.getName());
            }
        }
        if (outThread_ != null) {
            outThread_.join(joinDelay_);
        }
        if (errThread_ != null) {
            errThread_.join(joinDelay_);
        }
        if (outerrThreadsThrowable != null) {
            throw new RuntimeException(outerrThreadsThrowable);
        }
    } catch (InterruptedException e) {
        //ignore
    }
}

From source file: edu.umn.cs.spatialHadoop.core.RectangleNN.java

License: Open Source License

public static <S1 extends Shape, S2 extends Shape> int SpatialJoin_planeSweepFilterOnly(final List<S1> R,
        final List<S2> S, final ResultCollector2<S1, S2> output, Reporter reporter) throws IOException {

    LOG.debug("Start spatial join plan sweep algorithm !!!");

    final RectangleID[] Rmbrs = new RectangleID[R.size()];
    for (int i = 0; i < R.size(); i++) {
        Rmbrs[i] = new RectangleID(i, R.get(i).getMBR());
    }
    final RectangleID[] Smbrs = new RectangleID[S.size()];
    for (int i = 0; i < S.size(); i++) {
        Smbrs[i] = new RectangleID(i, S.get(i).getMBR());
    }

    final IntWritable count = new IntWritable();
    int filterCount = SpatialJoin_rectangles(Rmbrs, Smbrs, new OutputCollector<RectangleID, RectangleID>() {
        @Override
        public void collect(RectangleID r1, RectangleID r2) throws IOException {
            //if (R.get(r1.id).isIntersected(S.get(r2.id))) {
            if (output != null)
                output.collect(R.get(r1.id), S.get(r2.id));
            count.set(count.get() + 1);
            //}
        }
    }, reporter);

    LOG.debug("Filtered result size " + filterCount + ", refined result size " + count.get());

    return count.get();
}

From source file: edu.umn.cs.spatialHadoop.core.RectangleNN.java

License: Open Source License

/**
 * The general version of self join algorithm which works with arbitrary
 * shapes. First, it performs a filter step where it finds shapes with
 * overlapping MBRs. Second, an optional refine step can be executed to
 * return only shapes which actually overlap.
 * @param R - input set of shapes
 * @param refine - Whether or not to run a refine step
 * @param output - output collector where the results are reported
 * @return - number of pairs returned by the planesweep algorithm
 * @throws IOException
 */
public static <S extends Shape> int SelfJoin_planeSweep(final S[] R, boolean refine,
        final OutputCollector<S, S> output, Progressable reporter) throws IOException {
    // Use a two-phase filter and refine approach
    // 1- Use MBRs as a first filter
    // 2- Use ConvexHull as a second filter
    // 3- Use the exact shape for refinement
    final RectangleID[] mbrs = new RectangleID[R.length];
    for (int i = 0; i < R.length; i++) {
        mbrs[i] = new RectangleID(i, R[i].getMBR());
    }

    if (refine) {
        final IntWritable count = new IntWritable();
        int filterCount = SelfJoin_rectangles(mbrs, new OutputCollector<RectangleID, RectangleID>() {
            @Override
            public void collect(RectangleID r1, RectangleID r2) throws IOException {
                if (R[r1.id].isIntersected(R[r2.id])) {
                    if (output != null)
                        output.collect(R[r1.id], R[r2.id]);
                    count.set(count.get() + 1);
                }
            }
        }, reporter);

        LOG.debug("Filtered result size " + filterCount + ", refined result size " + count.get());

        return count.get();
    } else {
        return SelfJoin_rectangles(mbrs, new OutputCollector<RectangleID, RectangleID>() {
            @Override
            public void collect(RectangleID r1, RectangleID r2) throws IOException {
                if (output != null)
                    output.collect(R[r1.id], R[r2.id]);
            }
        }, reporter);
    }
}

From source file: edu.umn.cs.spatialHadoop.core.RectangleNN.java

License: Open Source License

/**
 * Group polygons by overlap
 * @param polygons
 * @param prog
 * @return
 * @throws IOException
 */
public static Geometry[][] groupPolygons(final Geometry[] polygons, final Progressable prog)
        throws IOException {
    // Group shapes into overlapping groups
    long t1 = System.currentTimeMillis();
    RectangleID[] mbrs = new RectangleID[polygons.length];
    for (int i = 0; i < polygons.length; i++) {
        Coordinate[] coords = polygons[i].getEnvelope().getCoordinates();
        double x1 = Math.min(coords[0].x, coords[2].x);
        double x2 = Math.max(coords[0].x, coords[2].x);
        double y1 = Math.min(coords[0].y, coords[2].y);
        double y2 = Math.max(coords[0].y, coords[2].y);

        mbrs[i] = new RectangleID(i, x1, y1, x2, y2);
    }

    // Parent link of the Set Union Find data structure
    final int[] parent = new int[mbrs.length];
    Arrays.fill(parent, -1);

    // Group records in clusters by overlapping
    SelfJoin_rectangles(mbrs, new OutputCollector<RectangleID, RectangleID>() {
        @Override
        public void collect(RectangleID r, RectangleID s) throws IOException {
            int rid = r.id;
            while (parent[rid] != -1) {
                int pid = parent[rid];
                if (parent[pid] != -1)
                    parent[rid] = parent[pid];
                rid = pid;
            }
            int sid = s.id;
            while (parent[sid] != -1) {
                int pid = parent[sid];
                if (parent[pid] != -1)
                    parent[sid] = parent[pid];
                sid = pid;
            }
            if (rid != sid)
                parent[rid] = sid;
        }
    }, prog);
    mbrs = null;
    // Put all records in one cluster as a list
    Map<Integer, List<Geometry>> groups = new HashMap<Integer, List<Geometry>>();
    for (int i = 0; i < parent.length; i++) {
        int root = parent[i];
        if (root == -1)
            root = i;
        while (parent[root] != -1) {
            root = parent[root];
        }
        List<Geometry> group = groups.get(root);
        if (group == null) {
            group = new Vector<Geometry>();
            groups.put(root, group);
        }
        group.add(polygons[i]);
    }
    long t2 = System.currentTimeMillis();

    Geometry[][] groupedPolygons = new Geometry[groups.size()][];
    int counter = 0;
    for (List<Geometry> group : groups.values()) {
        groupedPolygons[counter++] = group.toArray(new Geometry[group.size()]);
    }
    LOG.debug("Grouped " + parent.length + " shapes into " + groups.size() + " clusters in "
            + (t2 - t1) / 1000.0 + " seconds");
    return groupedPolygons;
}

From source file: edu.umn.cs.spatialHadoop.operations.DistributedJoin.java

License: Open Source License

private static long selfJoinLocal(Path in, Path out, OperationsParams params) throws IOException {
    if (isOneShotReadMode) {
        // Ensure all objects are read in one shot
        params.setInt(SpatialSite.MaxBytesInOneRead, -1);
        params.setInt(SpatialSite.MaxShapesInOneRead, -1);
    } else {
        params.setInt(SpatialSite.MaxBytesInOneRead, maxBytesInOneRead);
        params.setInt(SpatialSite.MaxShapesInOneRead, maxShapesInOneRead);
    }
    ShapeArrayInputFormat inputFormat = new ShapeArrayInputFormat();
    JobConf job = new JobConf(params);
    FileInputFormat.addInputPath(job, in);
    InputSplit[] splits = inputFormat.getSplits(job, 1);
    FileSystem outFs = out.getFileSystem(params);
    final PrintStream writer = new PrintStream(outFs.create(out));

    // Process all input files
    long resultSize = 0;
    for (InputSplit split : splits) {
        ShapeArrayRecordReader reader = new ShapeArrayRecordReader(job, (FileSplit) split);
        final Text temp = new Text();

        Rectangle key = reader.createKey();
        ArrayWritable value = reader.createValue();
        if (reader.next(key, value)) {
            Shape[] writables = (Shape[]) value.get();
            resultSize += SpatialAlgorithms.SelfJoin_planeSweep(writables, true,
                    new OutputCollector<Shape, Shape>() {
                        @Override
                        public void collect(Shape r, Shape s) throws IOException {
                            writer.print(r.toText(temp));
                            writer.print(",");
                            writer.println(s.toText(temp));
                        }
                    }, null);
            if (reader.next(key, value)) {
                throw new RuntimeException("Error! Not all values read in one shot.");
            }
        }

        reader.close();
    }
    writer.close();

    return resultSize;
}

From source file: it.crs4.pydoop.pipes.PipesReducer.java

License: Apache License

/**
 * Handle the end of the input by closing down the application.
 */
public void close() throws IOException {
    // if we haven't started the application, we have nothing to do
    if (isOk) {
        OutputCollector<K3, V3> nullCollector = new OutputCollector<K3, V3>() {
            public void collect(K3 key, V3 value) throws IOException {
                // NULL
            }
        };
        startApplication(nullCollector, Reporter.NULL);
    }
    try {
        if (isOk) {
            application.getDownlink().endOfInput();
        } else {
            // send the abort to the application and let it clean up
            application.getDownlink().abort();
        }
        LOG.info("waiting for finish");
        application.waitForFinish();
        LOG.info("got done");
    } catch (Throwable t) {
        application.abort(t);
    } finally {
        application.cleanup();
    }
}