Example usage for org.apache.hadoop.mapred JobConf getOutputFormat

List of usage examples for org.apache.hadoop.mapred JobConf getOutputFormat

Introduction

In this page you can find the example usage for org.apache.hadoop.mapred JobConf getOutputFormat.

Prototype

public OutputFormat getOutputFormat() 

Source Link

Document

Get the OutputFormat implementation for the map-reduce job, defaults to TextOutputFormat if not specified explicity.

Usage

From source file:colossal.pipe.ColHadoopReducer.java

License:Apache License

@Override
public void configure(JobConf conf) {
    super.configure(conf);
    isTextOutput = conf.getOutputFormat() instanceof TextOutputFormat;
}

From source file:com.scaleoutsoftware.soss.hserver.hadoop.MapperWrapperMapred.java

License:Apache License

/**
 * Runs mapper for the single split.//from w  w w.  ja va  2  s  .co  m
 *
 * @param mapOutputAccumulator mapOutputAccumulator to use
 * @param split                split ot run on
 */

@Override
@SuppressWarnings("unchecked")
public void runSplit(final MapOutputAccumulator<OUTKEY, OUTVALUE> mapOutputAccumulator, Object split,
        int splitIndex) throws IOException, ClassNotFoundException, InterruptedException {
    JobConf jobConf = new JobConf(this.jobConf); //Clone JobConf to prevent unexpected task interaction

    TaskAttemptID taskAttemptID = TaskAttemptID
            .downgrade(hadoopVersionSpecificCode.createTaskAttemptId(jobId, true, splitIndex));

    ReducerWrapperMapred.updateJobConf(jobConf, taskAttemptID, splitIndex);
    updateJobWithSplit(jobConf, split);

    InputFormat inputFormat = jobConf.getInputFormat();

    Reporter reporter = Reporter.NULL;

    //Create RecordReader
    org.apache.hadoop.mapred.RecordReader<INKEY, INVALUE> recordReader = inputFormat
            .getRecordReader((InputSplit) split, jobConf, reporter);

    //Make a mapper
    org.apache.hadoop.mapred.Mapper<INKEY, INVALUE, OUTKEY, OUTVALUE> mapper;
    try {
        mapper = (org.apache.hadoop.mapred.Mapper<INKEY, INVALUE, OUTKEY, OUTVALUE>) mapperConstructor
                .newInstance();
        mapper.configure(jobConf);
    } catch (Exception e) {
        throw new RuntimeException("Cannot instantiate mapper " + mapperConstructor.getDeclaringClass(), e);
    }

    //These are to support map only jobs which write output directly to HDFS.
    final RecordWriter outputRecordWriter;
    OutputCommitter outputCommitter = null;
    TaskAttemptContext taskAttemptContext = null;

    if (mapOnlyJob) {

        taskAttemptContext = hadoopVersionSpecificCode.createTaskAttemptContextMapred(jobConf, taskAttemptID);
        OutputFormat outputFormat = jobConf.getOutputFormat();
        FileSystem fs = FileSystem.get(jobConf);
        outputRecordWriter = (org.apache.hadoop.mapred.RecordWriter<OUTKEY, OUTVALUE>) outputFormat
                .getRecordWriter(fs, jobConf, ReducerWrapperMapred.getOutputName(splitIndex), Reporter.NULL);
        outputCommitter = jobConf.getOutputCommitter();

        //Create task object so it can handle file format initialization
        //The MapTask is private in the Hadoop 1.x so we have to go through reflection.
        try {
            Class reduceTask = Class.forName("org.apache.hadoop.mapred.MapTask");
            Constructor reduceTaskConstructor = reduceTask.getDeclaredConstructor(String.class,
                    TaskAttemptID.class, int.class, JobSplit.TaskSplitIndex.class, int.class);
            reduceTaskConstructor.setAccessible(true);
            Task task = (Task) reduceTaskConstructor.newInstance(null, taskAttemptID, splitIndex,
                    new JobSplit.TaskSplitIndex(), 0);
            task.setConf(jobConf);
            task.initialize(jobConf, jobId, Reporter.NULL, false);
        } catch (Exception e) {
            throw new IOException("Cannot initialize MapTask", e);
        }
        outputCommitter.setupTask(taskAttemptContext);
    } else {
        outputRecordWriter = null;
    }

    OutputCollector<OUTKEY, OUTVALUE> outputCollector;

    if (!mapOnlyJob) {
        outputCollector = new OutputCollector<OUTKEY, OUTVALUE>() {
            @Override
            public void collect(OUTKEY outkey, OUTVALUE outvalue) throws IOException {
                try {
                    mapOutputAccumulator.combine(outkey, outvalue);
                } catch (InterruptedException e) {
                    Thread.currentThread().interrupt();
                }
            }
        };
    } else {
        outputCollector = new OutputCollector<OUTKEY, OUTVALUE>() {
            @Override
            public void collect(OUTKEY outkey, OUTVALUE outvalue) throws IOException {
                outputRecordWriter.write(outkey, outvalue);
            }
        };
    }

    INKEY key = recordReader.createKey();
    INVALUE value = recordReader.createValue();

    while (recordReader.next(key, value)) {
        mapper.map(key, value, outputCollector, reporter);
    }
    mapper.close();

    recordReader.close();

    if (mapOnlyJob) {
        outputRecordWriter.close(Reporter.NULL);
        outputCommitter.commitTask(taskAttemptContext);
    }

}

From source file:com.scaleoutsoftware.soss.hserver.hadoop.ReducerWrapperMapred.java

License:Apache License

public ReducerWrapperMapred(HServerInvocationParameters invocationParameters, int hadoopPartition, int appId,
        int region, boolean sort) throws IOException, ClassNotFoundException, InterruptedException {
    this.invocationParameters = invocationParameters;
    JobConf jobConf = new JobConf((Configuration) invocationParameters.getConfiguration()); //Clone JobConf, so the temporary settings do not pollute other tasks

    LOG.info("Starting reducer:" + HadoopInvocationParameters.dumpConfiguration(jobConf));

    JobID jobID = (JobID) invocationParameters.getJobId();
    this.hadoopPartition = hadoopPartition;
    hadoopVersionSpecificCode = HadoopVersionSpecificCode.getInstance(invocationParameters.getHadoopVersion(),
            jobConf);//from  ww  w  .ja v  a 2 s.c  om

    TaskAttemptID taskAttemptID = TaskAttemptID
            .downgrade(hadoopVersionSpecificCode.createTaskAttemptId(jobID, false, hadoopPartition));

    updateJobConf(jobConf, taskAttemptID, region);

    context = hadoopVersionSpecificCode.createTaskAttemptContextMapred(jobConf, taskAttemptID);

    reducer = (org.apache.hadoop.mapred.Reducer<INKEY, INVALUE, OUTKEY, OUTVALUE>) ReflectionUtils
            .newInstance(jobConf.getReducerClass(), jobConf);

    reducer.configure(jobConf);

    OutputFormat outputFormat = jobConf.getOutputFormat();

    FileSystem fs = FileSystem.get(jobConf);
    recordWriter = (org.apache.hadoop.mapred.RecordWriter<OUTKEY, OUTVALUE>) outputFormat.getRecordWriter(fs,
            jobConf, getOutputName(hadoopPartition), Reporter.NULL);

    committer = jobConf.getOutputCommitter();

    //Create task object so it can handle file format initialization
    //The ReduceTask is private in the Hadoop 1.x so we have to go through reflection.
    try {
        Class reduceTask = Class.forName("org.apache.hadoop.mapred.ReduceTask");
        Constructor reduceTaskConstructor = reduceTask.getDeclaredConstructor(String.class, TaskAttemptID.class,
                int.class, int.class, int.class);
        reduceTaskConstructor.setAccessible(true);
        Task task = (Task) reduceTaskConstructor.newInstance(null, taskAttemptID, hadoopPartition, 0, 0);
        task.setConf(jobConf);
        task.initialize(jobConf, jobID, Reporter.NULL, false);
    } catch (Exception e) {
        throw new IOException("Cannot initialize ReduceTask", e);
    }

    committer.setupTask(context);

    Class<INKEY> keyClass = (Class<INKEY>) jobConf.getMapOutputKeyClass();
    WritableSerializerDeserializer<INKEY> firstKeySerializer = new WritableSerializerDeserializer<INKEY>(
            keyClass, null);
    WritableSerializerDeserializer<INKEY> secondKeySerializer = new WritableSerializerDeserializer<INKEY>(
            keyClass, null);
    Class<INVALUE> valueClass = (Class<INVALUE>) jobConf.getMapOutputValueClass();
    WritableSerializerDeserializer<INVALUE> valueSerializer = new WritableSerializerDeserializer<INVALUE>(
            valueClass, null);

    DataGridReaderParameters<INKEY, INVALUE> params = new DataGridReaderParameters<INKEY, INVALUE>(region,
            appId, HServerParameters.getSetting(REDUCE_USEMEMORYMAPPEDFILES, jobConf) > 0, firstKeySerializer,
            valueSerializer, invocationParameters.getSerializationMode(), secondKeySerializer, keyClass,
            valueClass, sort, HServerParameters.getSetting(REDUCE_CHUNKSTOREADAHEAD, jobConf),
            1024 * HServerParameters.getSetting(REDUCE_INPUTCHUNKSIZE_KB, jobConf),
            HServerParameters.getSetting(REDUCE_CHUNKREADTIMEOUT, jobConf));
    transport = DataGridChunkedCollectionReader.getGridReader(params);
    outputCollector = new OutputCollector<OUTKEY, OUTVALUE>() {
        @Override
        public void collect(OUTKEY outkey, OUTVALUE outvalue) throws IOException {
            recordWriter.write(outkey, outvalue);
        }
    };
}

From source file:com.scaleoutsoftware.soss.hserver.JobScheduler.java

License:Apache License

/**
 * Runs the map-reduce job on ScaleOut hServer.*
 *
 * @param job          the job to run//from   ww w . j a  v  a  2  s .c  o m
 * @param jobId        the id of the job
 * @param sortEnabled  if key sorting is enabled
 * @param jobParameter user defined parameter object for the job
 * @param grid         the invocation grid to run the job
 * @throws IOException            if errors occurred during the job
 * @throws InterruptedException   if the processing thread is interrupted
 * @throws ClassNotFoundException if the invocation grid does not contain the dependency class
 */
@SuppressWarnings("unchecked")
public void runOldApiJob(JobConf job, org.apache.hadoop.mapred.JobID jobId, boolean sortEnabled,
        Object jobParameter, InvocationGrid grid)
        throws IOException, InterruptedException, ClassNotFoundException {
    //Initialize user credential in advance
    int jobAppId = 0xFFFFFFF & BitConverter.hashStringOneInt(jobId.toString());
    String hadoopVersion = VersionInfo.getVersion();
    long time = System.currentTimeMillis();
    CreateUserCredentials.run(grid);

    try {
        //Check output specs before running the job
        job.getOutputFormat().checkOutputSpecs(FileSystem.get(job), job);

        JobContext jContext = HadoopVersionSpecificCode.getInstance(hadoopVersion, job).createJobContext(job,
                jobId);

        org.apache.hadoop.mapred.OutputCommitter outputCommitter = job.getOutputCommitter();
        outputCommitter.setupJob(jContext);

        //clear all temporary objects
        DataAccessor.clearObjects(jobAppId);

        //Calculating the partition layout
        com.scaleoutsoftware.soss.client.util.HostToPartitionsMapping hostNameToPartition = com.scaleoutsoftware.soss.client.util.HostToPartitionsMapping
                .getCurrent();
        List<InetAddress> hostAddresses = new ArrayList<InetAddress>(hostNameToPartition.getHosts());

        //Generating mapping of Hadoop partitions to SOSS partitions, so they are equally distributed across hosts
        int numHosts = hostAddresses.size();
        int numberOfSlotsPerNode = Math
                .max(grid != null ? grid.getMaxNumberOfCores() : Runtime.getRuntime().availableProcessors(), 1);

        //Set the number of splits to the number of cores
        if (NamedMapInputFormatMapred.class.isAssignableFrom(job.getInputFormat().getClass())) {
            int numberOfSplits = HServerParameters.getSetting(MAP_SPLITS_PER_CORE, job) * numHosts
                    * numberOfSlotsPerNode;
            job.setNumMapTasks(Math.min(numberOfSplits, HServerConstants.MAX_MAP_REDUCE_TASKS));
        }

        //Generating split to hostname map
        org.apache.hadoop.mapred.InputFormat inputFormat = job.getInputFormat();
        List<org.apache.hadoop.mapred.InputSplit> splitList = Arrays
                .asList(inputFormat.getSplits(job, job.getNumMapTasks()));
        Map<InetAddress, List<Integer>> splitToHostAddress = assignSplitsToHost(splitList, hostAddresses, null);

        //Choose the optimal number of reducers for GridOutputFormat
        if (job.getOutputFormat() instanceof NamedMapOutputFormatMapred) {
            job.setNumReduceTasks(numHosts * numberOfSlotsPerNode);
            sortEnabled = false;
        }

        int[] partitionMapping = hostNameToPartition.generateEvenItemDistribution(job.getNumReduceTasks());

        //Generating invocation parameters
        Class<? extends org.apache.hadoop.mapred.InputSplit> splitType = splitList.size() > 0
                ? splitList.get(0).getClass()
                : null;

        HadoopInvocationParameters hadoopParameters = new HadoopInvocationParameters(job, jobId, true);

        HServerInvocationParameters<org.apache.hadoop.mapred.InputSplit> parameters = new HServerInvocationParameters<org.apache.hadoop.mapred.InputSplit>(
                hadoopParameters, jobAppId, partitionMapping, hostNameToPartition, numberOfSlotsPerNode,
                splitType, splitList, splitToHostAddress, false, sortEnabled, hadoopVersion, jobParameter,
                SerializationMode.DEFAULT);

        StringBuilder stringBuilder = new StringBuilder();
        stringBuilder.append("Splits created:\n");
        for (InetAddress address : splitToHostAddress.keySet()) {
            stringBuilder.append("Host ");
            stringBuilder.append(address);
            stringBuilder.append(" has ");
            stringBuilder.append(splitToHostAddress.get(address).size());
            stringBuilder.append(" splits.\n");
        }
        System.out.println(stringBuilder.toString());

        System.out.println("Job initialization completed in " + (System.currentTimeMillis() - time) + " ms.");
        time = System.currentTimeMillis();

        InvokeResult<MapperResult> mapInvokeResult = MessagingHelper.invoke(grid,
                RunMapper.MapperInvokable.class, parameters, TimeSpan.INFINITE_TIMEOUT.getSeconds());

        if (mapInvokeResult.getErrors() != null && mapInvokeResult.getErrors().size() > 0) {
            throw new IOException("Map invocation failed.", mapInvokeResult.getErrors().get(0));
        }

        System.out.println("Map invocation done in " + (System.currentTimeMillis() - time) + " ms.");
        time = System.currentTimeMillis();

        MapperResult resultObject = mapInvokeResult.getResult();

        if (resultObject == null || mapInvokeResult.getNumFailed() != 0) {
            throw new IOException("Mapper invocation failed. Num failed = " + mapInvokeResult.getNumFailed());
        }

        if (resultObject.getNumberOfSplitsProcessed() != splitList.size()) {
            throw new IOException("Number of splits does not match the number of invocations. Nsplits = "
                    + splitList.size() + ", Ninvokes =" + resultObject.getNumberOfSplitsProcessed());
        }

        if (partitionMapping.length > 0) {
            //Running the reduce step
            InvokeResult<Integer> reduceInvokeResult = MessagingHelper.invoke(grid, ReduceInvokable.class,
                    jobAppId, TimeSpan.INFINITE_TIMEOUT.getSeconds());

            System.out.println("Reduce invocation done in " + (System.currentTimeMillis() - time) + " ms.");

            DataAccessor.clearObjects(jobAppId); //clear all temporary objects

            if (reduceInvokeResult.getErrors() != null && reduceInvokeResult.getErrors().size() > 0) {
                throw new IOException("Reduce invocation failed.", reduceInvokeResult.getErrors().get(0));
            }
            if (reduceInvokeResult.getNumFailed() != 0) {
                throw new IOException("Reduce invocation failed.");
            }
            if (reduceInvokeResult.getResult() != partitionMapping.length) {
                throw new IOException("Not all partitions were reduced. Expected = " + partitionMapping.length
                        + " Actual = " + reduceInvokeResult.getResult());
            }
        }
        outputCommitter.commitJob(jContext);
    } catch (StateServerException e) {
        throw new IOException("ScaleOut hServer access error.", e);
    }

}

From source file:edu.uci.ics.hyracks.dataflow.hadoop.HadoopWriteOperatorDescriptor.java

License:Apache License

private static FileSplit[] getOutputSplits(JobConf conf, int noOfMappers) throws ClassNotFoundException {
    int numOutputters = conf.getNumReduceTasks() != 0 ? conf.getNumReduceTasks() : noOfMappers;
    Object outputFormat = null;// w  w w .j  av  a  2  s.c  om
    if (conf.getUseNewMapper()) {
        outputFormat = ReflectionUtils
                .newInstance(new ContextFactory().createJobContext(conf).getOutputFormatClass(), conf);
    } else {
        outputFormat = conf.getOutputFormat();
    }
    if (outputFormat instanceof NullOutputFormat) {
        FileSplit[] outputFileSplits = new FileSplit[numOutputters];
        for (int i = 0; i < numOutputters; i++) {
            String outputPath = "/tmp/" + System.currentTimeMillis() + i;
            outputFileSplits[i] = new FileSplit("localhost", new FileReference(new File(outputPath)));
        }
        return outputFileSplits;
    } else {

        FileSplit[] outputFileSplits = new FileSplit[numOutputters];
        String absolutePath = FileOutputFormat.getOutputPath(conf).toString();
        for (int index = 0; index < numOutputters; index++) {
            String suffix = new String("part-00000");
            suffix = new String(suffix.substring(0, suffix.length() - ("" + index).length()));
            suffix = suffix + index;
            String outputPath = absolutePath + "/" + suffix;
            outputFileSplits[index] = new FileSplit("localhost", outputPath);
        }
        return outputFileSplits;
    }
}

From source file:it.crs4.pydoop.pipes.Submitter.java

License:Apache License

@Override
public int run(String[] args) throws Exception {
    CommandLineParser cli = new CommandLineParser();
    if (args.length == 0) {
        cli.printUsage();/*from w w  w  .  j av a 2s  .  c o m*/
        return 1;
    }
    cli.addOption("input", false, "input path to the maps", "path");
    cli.addOption("output", false, "output path from the reduces", "path");

    cli.addOption("jar", false, "job jar file", "path");
    cli.addOption("inputformat", false, "java classname of InputFormat", "class");
    //cli.addArgument("javareader", false, "is the RecordReader in Java");
    cli.addOption("map", false, "java classname of Mapper", "class");
    cli.addOption("partitioner", false, "java classname of Partitioner", "class");
    cli.addOption("reduce", false, "java classname of Reducer", "class");
    cli.addOption("writer", false, "java classname of OutputFormat", "class");
    cli.addOption("program", false, "URI to application executable", "class");
    cli.addOption("reduces", false, "number of reduces", "num");
    cli.addOption("jobconf", false,
            "\"n1=v1,n2=v2,..\" (Deprecated) Optional. Add or override a JobConf property.", "key=val");
    cli.addOption("lazyOutput", false, "Optional. Create output lazily", "boolean");
    Parser parser = cli.createParser();
    try {

        GenericOptionsParser genericParser = new GenericOptionsParser(getConf(), args);
        CommandLine results = parser.parse(cli.options, genericParser.getRemainingArgs());

        JobConf job = new JobConf(getConf());

        if (results.hasOption("input")) {
            FileInputFormat.setInputPaths(job, results.getOptionValue("input"));
        }
        if (results.hasOption("output")) {
            FileOutputFormat.setOutputPath(job, new Path(results.getOptionValue("output")));
        }
        if (results.hasOption("jar")) {
            job.setJar(results.getOptionValue("jar"));
        }
        if (results.hasOption("inputformat")) {
            setIsJavaRecordReader(job, true);
            job.setInputFormat(getClass(results, "inputformat", job, InputFormat.class));
        }
        if (results.hasOption("javareader")) {
            setIsJavaRecordReader(job, true);
        }
        if (results.hasOption("map")) {
            setIsJavaMapper(job, true);
            job.setMapperClass(getClass(results, "map", job, Mapper.class));
        }
        if (results.hasOption("partitioner")) {
            job.setPartitionerClass(getClass(results, "partitioner", job, Partitioner.class));
        }
        if (results.hasOption("reduce")) {
            setIsJavaReducer(job, true);
            job.setReducerClass(getClass(results, "reduce", job, Reducer.class));
        }
        if (results.hasOption("reduces")) {
            job.setNumReduceTasks(Integer.parseInt(results.getOptionValue("reduces")));
        }
        if (results.hasOption("writer")) {
            setIsJavaRecordWriter(job, true);
            job.setOutputFormat(getClass(results, "writer", job, OutputFormat.class));
        }

        if (results.hasOption("lazyOutput")) {
            if (Boolean.parseBoolean(results.getOptionValue("lazyOutput"))) {
                LazyOutputFormat.setOutputFormatClass(job, job.getOutputFormat().getClass());
            }
        }

        if (results.hasOption("program")) {
            setExecutable(job, results.getOptionValue("program"));
        }
        if (results.hasOption("jobconf")) {
            LOG.warn("-jobconf option is deprecated, please use -D instead.");
            String options = results.getOptionValue("jobconf");
            StringTokenizer tokenizer = new StringTokenizer(options, ",");
            while (tokenizer.hasMoreTokens()) {
                String keyVal = tokenizer.nextToken().trim();
                String[] keyValSplit = keyVal.split("=");
                job.set(keyValSplit[0], keyValSplit[1]);
            }
        }
        // if they gave us a jar file, include it into the class path
        String jarFile = job.getJar();
        if (jarFile != null) {
            final URL[] urls = new URL[] { FileSystem.getLocal(job).pathToFile(new Path(jarFile)).toURL() };
            //FindBugs complains that creating a URLClassLoader should be
            //in a doPrivileged() block. 
            ClassLoader loader = AccessController.doPrivileged(new PrivilegedAction<ClassLoader>() {
                public ClassLoader run() {
                    return new URLClassLoader(urls);
                }
            });
            job.setClassLoader(loader);
        }

        runJob(job);
        return 0;
    } catch (ParseException pe) {
        LOG.info("Error : " + pe);
        cli.printUsage();
        return 1;
    }

}

From source file:org.apache.ignite.internal.processors.hadoop.impl.v1.HadoopV1OutputCollector.java

License:Apache License

/**
 * @param jobConf Job configuration./*from  ww w  .  ja v  a 2  s  .  c  om*/
 * @param taskCtx Task context.
 * @param directWrite Direct write flag.
 * @param fileName File name.
 * @throws IOException In case of IO exception.
 */
HadoopV1OutputCollector(JobConf jobConf, HadoopTaskContext taskCtx, boolean directWrite,
        @Nullable String fileName, TaskAttemptID attempt) throws IOException {
    this.jobConf = jobConf;
    this.taskCtx = taskCtx;
    this.attempt = attempt;

    if (directWrite) {
        jobConf.set("mapreduce.task.attempt.id", attempt.toString());

        OutputFormat outFormat = jobConf.getOutputFormat();

        writer = outFormat.getRecordWriter(null, jobConf, fileName, Reporter.NULL);
    } else
        writer = null;
}

From source file:org.apache.ignite.internal.processors.hadoop.v1.GridHadoopV1OutputCollector.java

License:Apache License

/**
 * @param jobConf Job configuration.// www .  j a  va 2 s .  c  om
 * @param taskCtx Task context.
 * @param directWrite Direct write flag.
 * @param fileName File name.
 * @throws IOException In case of IO exception.
 */
GridHadoopV1OutputCollector(JobConf jobConf, GridHadoopTaskContext taskCtx, boolean directWrite,
        @Nullable String fileName, TaskAttemptID attempt) throws IOException {
    this.jobConf = jobConf;
    this.taskCtx = taskCtx;
    this.attempt = attempt;

    if (directWrite) {
        jobConf.set("mapreduce.task.attempt.id", attempt.toString());

        OutputFormat outFormat = jobConf.getOutputFormat();

        writer = outFormat.getRecordWriter(null, jobConf, fileName, Reporter.NULL);
    } else
        writer = null;
}

From source file:org.archive.jbs.Merge.java

License:Apache License

public int run(String[] args) throws Exception {
    if (args.length < 2) {
        System.err.println("jbs.Merge <output> <input>...");
        return 1;
    }//from w ww.  j av a 2s .  c om

    JobConf conf = new JobConf(getConf(), Merge.class);

    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(Text.class);

    conf.setCombinerClass(Reduce.class);
    conf.setReducerClass(Reduce.class);

    // Choose the outputformat to either merge or index the records
    //
    // org.archive.jbs.lucene.LuceneOutputFormat
    //    - builds local Lucene index
    //
    // org.archive.jbs.solr.SolrOutputFormat
    //    - sends documents to remote Solr server
    //
    // org.apache.hadoop.mapred.MapFileOutputFormat
    //    - writes merged documents to Hadoop MapFile
    conf.setOutputFormat((Class) Class
            .forName(conf.get("jbs.outputformat.class", "org.apache.hadoop.mapred.MapFileOutputFormat")));

    // Set the Hadoop job name to incorporate the output format name.
    String formatName = conf.getOutputFormat().getClass().getName();
    conf.setJobName("jbs.Merge "
            + formatName.substring(formatName.lastIndexOf('.') != -1 ? (formatName.lastIndexOf('.') + 1) : 0));

    // Add the input paths as either NutchWAX segment directories or
    // text .dup files.
    for (int i = 1; i < args.length; i++) {
        Path p = new Path(args[i]);

        // Expand any file globs and then check each matching path
        FileStatus[] files = FileSystem.get(conf).globStatus(p);

        for (FileStatus file : files) {
            if (file.isDir()) {
                // If it's a directory, then check if it is a Nutch segment, otherwise treat as a SequenceFile.
                if (p.getFileSystem(conf).exists(new Path(file.getPath(), "parse_data"))) {
                    LOG.info("Input NutchWax: " + file.getPath());
                    MultipleInputs.addInputPath(conf, new Path(file.getPath(), "parse_data"),
                            SequenceFileInputFormat.class, NutchMapper.class);
                    MultipleInputs.addInputPath(conf, new Path(file.getPath(), "parse_text"),
                            SequenceFileInputFormat.class, NutchMapper.class);
                } else {
                    // Assume it's a SequenceFile of JSON-encoded Documents.
                    LOG.info("Input Document: " + file.getPath());
                    MultipleInputs.addInputPath(conf, file.getPath(), SequenceFileInputFormat.class,
                            DocumentMapper.class);
                }
            } else {
                // Not a directory, assume it's a text file, either CDX or property specifications.
                LOG.info("Input TextFile: " + file.getPath());
                MultipleInputs.addInputPath(conf, file.getPath(), TextInputFormat.class, TextMapper.class);
            }
        }
    }

    FileOutputFormat.setOutputPath(conf, new Path(args[0]));

    RunningJob rj = JobClient.runJob(conf);

    return rj.isSuccessful() ? 0 : 1;
}

From source file:tap.core.ReducerBridge.java

License:Apache License

@Override
public void configure(JobConf conf) {
    super.configure(conf);

    isTextOutput = conf.getOutputFormat() instanceof TextOutputFormat;
    isProtoOutput = conf.getOutputFormat() instanceof TapfileOutputFormat;

    if (isProtoOutput) {
        try {//from w  ww  .j  a  va2s  .  co m
            mapOutClass = Class.forName(conf.get(Phase.MAP_OUT_CLASS));
            reduceOutClass = Class.forName(conf.get(Phase.REDUCE_OUT_CLASS));
            if (mapOutClass != reduceOutClass) {
                reduceOutKeyChanges = true;
                String groupBy = conf.get(Phase.GROUP_BY);
                String sortBy = conf.get(Phase.SORT_BY);
                reduceOutSchema = ReflectUtils.getSchema(ObjectFactory.newInstance(reduceOutClass));
                extractor = ReflectionKeyExtractor.getReflectionKeyExtractorForReduceOutKey(reduceOutSchema,
                        groupBy, sortBy);
            }
        } catch (Exception e) {
            // TODO Auto-generated catch block
            e.printStackTrace();
        }
    }

    multiOutputPrefix = conf.get(Phase.MULTIPLE_OUTPUT_PREFIX);
    if (multiOutputPrefix == null)
        multiOutputPrefix = "out";

    MultipleOutputs.addMultiNamedOutput(conf, multiOutputPrefix, conf.getOutputFormat().getClass(),
            conf.getOutputKeyClass(), conf.getOutputValueClass());

    this.multiOutput = new MultipleOutputs(conf);
}