Example usage for org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter.getWorkPath()

Introduction

On this page you can find example usage of org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter.getWorkPath().

Prototype

public Path getWorkPath() throws IOException 

Document

Get the directory that the task should write results into.
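
Before the project-specific examples below, here is a minimal sketch of the typical pattern: inside a custom FileOutputFormat, obtain the task's FileOutputCommitter and resolve output files against getWorkPath(). The class name SketchOutputFormat and the choice of TextOutputFormat as the base class are illustrative assumptions, not taken from any of the projects listed here.

import java.io.IOException;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;

// Hypothetical subclass used only to illustrate the getWorkPath() call pattern.
public class SketchOutputFormat extends TextOutputFormat<LongWritable, Text> {

    @Override
    public Path getDefaultWorkFile(TaskAttemptContext context, String extension) throws IOException {
        // The committer manages the task-attempt's temporary directory under
        // ${output}/_temporary; files written there are promoted to the final
        // output directory when the task attempt commits.
        FileOutputCommitter committer = (FileOutputCommitter) getOutputCommitter(context);
        Path workDir = committer.getWorkPath();
        // Resolve a unique per-task file name inside the work directory.
        return new Path(workDir, getUniqueFile(context, getOutputName(context), extension));
    }
}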

Usage

From source file: co.nubetech.hiho.mapreduce.lib.output.AppendSequenceFileOutputFormat.java

License: Apache License

@Override
public Path getDefaultWorkFile(TaskAttemptContext context, String extension) throws IOException {
    Path p1;
    isAppend = context.getConfiguration().get(HIHOConf.IS_APPEND, "false");
    if (isAppend.equalsIgnoreCase("false")) {
        p1 = super.getDefaultWorkFile(context, extension);
    } else {
        FileOutputCommitter committer = (FileOutputCommitter) getOutputCommitter(context);
        Path p = committer.getWorkPath();
        fileCount = p.getFileSystem(context.getConfiguration()).getContentSummary(getOutputPath(context))
                .getFileCount();
        if (fileCount > 1)
            fileCount = fileCount - 1;
        p1 = new Path(committer.getWorkPath(), getUniqueFile(context, "part", extension));
    }
    return p1;
}

From source file: co.nubetech.hiho.mapreduce.lib.output.AppendTextOutputFormat.java

License: Apache License

@Override
public Path getDefaultWorkFile(TaskAttemptContext context, String extension) throws IOException {
    Path p1;
    isAppend = context.getConfiguration().get(HIHOConf.IS_APPEND, "false");
    if (isAppend.equalsIgnoreCase("false")) {
        p1 = super.getDefaultWorkFile(context, extension);
    } else {
        FileOutputCommitter committer = (FileOutputCommitter) getOutputCommitter(context);
        Path p = committer.getWorkPath();
        fileCount = p.getFileSystem(context.getConfiguration()).getContentSummary(getOutputPath(context))
                .getFileCount();
        if (fileCount > 1) {
            fileCount = fileCount - 1;
        }
        p1 = new Path(committer.getWorkPath(), getUniqueFile(context, "part", extension));
    }
    return p1;
}

From source file: com.asakusafw.runtime.stage.output.TemporaryOutputFormat.java

License: Apache License

/**
 * Creates a new {@link RecordWriter} to output temporary data.
 * @param <V> value type
 * @param context current context
 * @param name output name
 * @param dataType value type
 * @return the created writer
 * @throws IOException if failed to create a new {@link RecordWriter}
 * @throws InterruptedException if interrupted
 * @throws IllegalArgumentException if some parameters were {@code null}
 */
public <V> RecordWriter<NullWritable, V> createRecordWriter(TaskAttemptContext context, String name,
        Class<V> dataType) throws IOException, InterruptedException {
    if (context == null) {
        throw new IllegalArgumentException("context must not be null"); //$NON-NLS-1$
    }
    if (name == null) {
        throw new IllegalArgumentException("name must not be null"); //$NON-NLS-1$
    }
    if (dataType == null) {
        throw new IllegalArgumentException("dataType must not be null"); //$NON-NLS-1$
    }
    CompressionCodec codec = null;
    Configuration conf = context.getConfiguration();
    if (FileOutputFormat.getCompressOutput(context)) {
        Class<?> codecClass = FileOutputFormat.getOutputCompressorClass(context, DefaultCodec.class);
        codec = (CompressionCodec) ReflectionUtils.newInstance(codecClass, conf);
    }
    FileOutputCommitter committer = getOutputCommitter(context);
    final Path file = new Path(committer.getWorkPath(), FileOutputFormat.getUniqueFile(context, name, "")); //$NON-NLS-1$
    final ModelOutput<V> out = TemporaryStorage.openOutput(conf, dataType, file, codec);
    return new RecordWriter<NullWritable, V>() {

        @Override
        public void write(NullWritable key, V value) throws IOException {
            out.write(value);
        }

        @Override
        public void close(TaskAttemptContext ignored) throws IOException {
            out.close();
        }

        @Override
        public String toString() {
            return String.format("TemporaryOutput(%s)", file); //$NON-NLS-1$
        }
    };
}

From source file: com.bonc.mr_roamRecognition_hjpt.comm.NewFileOutputFormat.java

License: Apache License

/**
 * Get the {@link Path} to the task's temporary output directory for the
 * map-reduce job
 * 
 * <h4 id="SideEffectFiles">Tasks' Side-Effect Files</h4>
 * 
 * <p>
 * Some applications need to create/write-to side-files, which differ from
 * the actual job-outputs.
 * 
 * <p>
 * In such cases there could be issues with 2 instances of the same TIP
 * (running simultaneously e.g. speculative tasks) trying to open/write-to
 * the same file (path) on HDFS. Hence the application-writer will have to
 * pick unique names per task-attempt (e.g. using the attemptid, say
 * <tt>attempt_200709221812_0001_m_000000_0</tt>), not just per TIP.
 * </p>
 * 
 * <p>
 * To get around this the Map-Reduce framework helps the application-writer
 * out by maintaining a special
 * <tt>${mapreduce.output.fileoutputformat.outputdir}/_temporary/_${taskid}</tt>
 * sub-directory for each task-attempt on HDFS where the output of the
 * task-attempt goes. On successful completion of the task-attempt the files
 * in the
 * <tt>${mapreduce.output.fileoutputformat.outputdir}/_temporary/_${taskid}</tt>
 * (only) are <i>promoted</i> to
 * <tt>${mapreduce.output.fileoutputformat.outputdir}</tt>. Of course, the
 * framework discards the sub-directory of unsuccessful task-attempts. This
 * is completely transparent to the application.
 * </p>
 * 
 * <p>
 * The application-writer can take advantage of this by creating any
 * side-files required in a work directory during execution of his task i.e.
 * via {@link #getWorkOutputPath(TaskInputOutputContext)}, and the framework
 * will move them out similarly - thus she doesn't have to pick unique paths
 * per task-attempt.
 * </p>
 * 
 * <p>
 * The entire discussion holds true for maps of jobs with reducer=NONE (i.e.
 * 0 reduces) since output of the map, in that case, goes directly to HDFS.
 * </p>
 * 
 * @return the {@link Path} to the task's temporary output directory for the
 *         map-reduce job.
 */
public static Path getWorkOutputPath(TaskInputOutputContext<?, ?, ?, ?> context)
        throws IOException, InterruptedException {
    FileOutputCommitter committer = (FileOutputCommitter) context.getOutputCommitter();
    return committer.getWorkPath();
}
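
The javadoc above describes how side-effect files written under the task's work directory are promoted to the job output directory on commit. As a hedged illustration of that pattern (the mapper class and file name here are hypothetical, not taken from the project above), a task could write an auxiliary file next to its regular output like this:

import java.io.IOException;

import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter;

// Hypothetical mapper showing a side-effect file written into the task's work directory.
public class SideFileMapper extends Mapper<LongWritable, Text, LongWritable, Text> {

    @Override
    protected void cleanup(Context context) throws IOException, InterruptedException {
        // getOutputCommitter() is available on the task context; for file-based
        // output formats it is a FileOutputCommitter whose work path is the
        // task-attempt's _temporary sub-directory.
        FileOutputCommitter committer = (FileOutputCommitter) context.getOutputCommitter();
        Path sideFile = new Path(committer.getWorkPath(),
                "side-" + context.getTaskAttemptID().getTaskID().getId());
        FileSystem fs = sideFile.getFileSystem(context.getConfiguration());
        try (FSDataOutputStream out = fs.create(sideFile, false)) {
            // Anything written here is promoted to the job output directory
            // only if this task attempt commits successfully.
            out.writeUTF("side-effect data");
        }
    }
}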

From source file: com.bonc.mr_roamRecognition_hjpt.comm.NewFileOutputFormat.java

License: Apache License

/**
 * Get the default path and filename for the output format.
 *
 * @param context
 *            the task context
 * @param extension
 *            an extension to add to the filename
 * @return a full path $output/_temporary/$taskid/part-[mr]-$id
 * @throws IOException
 */
public Path getDefaultWorkFile(TaskAttemptContext context, String extension) throws IOException {
    FileOutputCommitter committer = (FileOutputCommitter) getOutputCommitter(context);
    return new Path(committer.getWorkPath(), getUniqueFile(context, getOutputName(context), extension));
}

From source file: com.cloudera.dataflow.spark.ShardNameTemplateHelper.java

License: Open Source License

public static <K, V> Path getDefaultWorkFile(FileOutputFormat<K, V> format, TaskAttemptContext context)
        throws IOException {
    FileOutputCommitter committer = (FileOutputCommitter) format.getOutputCommitter(context);
    return new Path(committer.getWorkPath(), getOutputFile(context));
}

From source file: com.datasalt.pangool.tuplemr.mapred.lib.output.ProxyOutputFormat.java

License: Apache License

@Override
public OutputCommitter getOutputCommitter(TaskAttemptContext context) throws IOException {
    createOutputFormatIfNeeded(context);

    String outDir = context.getConfiguration().get("mapred.output.dir");
    originalDir = outDir;
    FileOutputCommitter committer = (FileOutputCommitter) super.getOutputCommitter(context);
    baseDir = committer.getWorkPath() + "";
    Configuration conf = new Configuration(context.getConfiguration());
    TaskAttemptContext reContext;
    try {
        reContext = TaskAttemptContextFactory.get(conf, context.getTaskAttemptID());
    } catch (Exception e) {
        throw new IOException(e);
    }

    reContext.getConfiguration().set("mapred.output.dir", baseDir);
    // This is for Hadoop 2.0 :
    reContext.getConfiguration().set("mapreduce.output.fileoutputformat.outputdir", baseDir);

    try {
        return new ProxyOutputCommitter(new Path(originalDir), context,
                outputFormat.getOutputCommitter(reContext));
    } catch (InterruptedException e) {
        throw new RuntimeException(e);
    }
}

From source file: com.facebook.hiveio.common.HadoopUtils.java

License: Apache License

/**
 * Set worker output directory
 * @param context Task context
 * @throws IOException I/O errors
 */
public static void setWorkOutputDir(TaskAttemptContext context) throws IOException {
    Configuration conf = context.getConfiguration();
    String outputPath = getOutputDir(conf);
    // we need to do this to get the task path and set it for mapred
    // implementation since it can't be done automatically because of
    // mapreduce->mapred abstraction
    if (outputPath != null) {
        FileOutputCommitter foc = new FileOutputCommitter(getOutputPath(conf), context);
        Path path = foc.getWorkPath();
        FileSystem fs = path.getFileSystem(conf);
        fs.mkdirs(path);
        conf.set("mapred.work.output.dir", path.toString());
        LOG.info("Setting mapred.work.output.dir to {}", path.toString());
    }
}

From source file: com.linkedin.camus.etl.kafka.common.StringKafkaRecordWriterProvider.java

@Override
public RecordWriter<IEtlKey, CamusWrapper> getDataRecordWriter(final TaskAttemptContext context,
        final String fileName, CamusWrapper data, FileOutputCommitter committer)
        throws IOException, InterruptedException {
    Configuration conf = context.getConfiguration();

    Path file = committer.getWorkPath();
    file = new Path(file, EtlMultiOutputFormat.getUniqueFile(context, fileName, getFilenameExtension()));

    CompressionCodec codec = null;
    SequenceFile.CompressionType compressionType = SequenceFile.CompressionType.NONE;

    final SequenceFile.Writer out = SequenceFile.createWriter(conf, SequenceFile.Writer.file(file),
            SequenceFile.Writer.keyClass(Text.class), SequenceFile.Writer.valueClass(Text.class),
            SequenceFile.Writer.compression(compressionType, codec), SequenceFile.Writer.progressable(context));

    return new RecordWriter<IEtlKey, CamusWrapper>() {

        @Override
        public void write(IEtlKey iEtlKey, CamusWrapper camusWrapper) throws IOException {
            String record = (String) camusWrapper.getRecord() + recordDelimiter;
            out.append(new Text(String.valueOf(iEtlKey.getOffset())), new Text(record.getBytes()));
        }

        @Override
        public void close(TaskAttemptContext taskAttemptContext) throws IOException {
            out.close();
        }
    };
}

From source file: com.splout.db.hadoop.engine.SploutSQLProxyOutputFormat.java

License: Apache License

@Override
public RecordWriter<ITuple, NullWritable> getRecordWriter(TaskAttemptContext context)
        throws IOException, InterruptedException {

    long waitTimeHeartBeater = context.getConfiguration().getLong(HeartBeater.WAIT_TIME_CONF, 5000);
    heartBeater = new HeartBeater(context, waitTimeHeartBeater);
    heartBeater.needHeartBeat();
    conf = context.getConfiguration();
    this.context = context;

    outputFormat.setConf(context.getConfiguration());

    return new RecordWriter<ITuple, NullWritable>() {

        // Temporary and permanent Paths for properly writing Hadoop output files
        private Map<Integer, Path> permPool = new HashMap<Integer, Path>();
        private Map<Integer, Path> tempPool = new HashMap<Integer, Path>();

        private void initSql(int partition) throws IOException, InterruptedException {
            // HDFS final location of the generated partition file. It is first
            // written to a temporary folder in HDFS and then committed by the
            // OutputCommitter to its proper location.
            FileOutputCommitter committer = (FileOutputCommitter) getOutputCommitter(
                    SploutSQLProxyOutputFormat.this.context);
            Path perm = new Path(committer.getWorkPath(), partition + ".db");
            FileSystem fs = perm.getFileSystem(conf);

            // Make a task unique name that contains the actual index output name to
            // make debugging simpler
            // Note: if using JVM reuse, the sequence number will not be reset for a
            // new task using the jvm
            Path temp = conf.getLocalPath("mapred.local.dir",
                    "splout_task_" + SploutSQLProxyOutputFormat.this.context.getTaskAttemptID() + '.'
                            + FILE_SEQUENCE.incrementAndGet());

            FileSystem localFileSystem = FileSystem.getLocal(conf);
            if (localFileSystem.exists(temp)) {
                localFileSystem.delete(temp, true);
            }
            localFileSystem.mkdirs(temp);

            Path local = fs.startLocalOutput(perm, new Path(temp, partition + ".db"));

            //
            permPool.put(partition, perm);
            tempPool.put(partition, new Path(temp, partition + ".db"));

            outputFormat.initPartition(partition, local);
        }

        @Override
        public void close(TaskAttemptContext ctx) throws IOException, InterruptedException {
            FileSystem fs = FileSystem.get(ctx.getConfiguration());
            try {
                if (ctx != null) {
                    heartBeater.setProgress(ctx);
                }
                outputFormat.close();
                for (Map.Entry<Integer, Path> entry : permPool.entrySet()) {
                    // Hadoop - completeLocalOutput()
                    fs.completeLocalOutput(entry.getValue(), tempPool.get(entry.getKey()));
                }
            } finally { // in any case, destroy the HeartBeater
                heartBeater.cancelHeartBeat();
            }
        }

        @Override
        public void write(ITuple tuple, NullWritable ignore) throws IOException, InterruptedException {
            int partition = (Integer) tuple.get(SploutSQLOutputFormat.PARTITION_TUPLE_FIELD);
            if (tempPool.get(partition) == null) {
                initSql(partition);
            }
            outputFormat.write(tuple);
        }

    };
}