Example usage for the org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter constructor


Introduction

On this page you can find example usage for the org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter constructor.

Prototype

@Private
public FileOutputCommitter(Path outputPath, JobContext context) throws IOException 


Document

Create a file output committer
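
Before the real-world examples below, here is a minimal, self-contained sketch (not taken from any of the source files on this page) of the typical lifecycle around this constructor: build a committer for an output directory, set up the job and task, write under the committer's work path, then commit. The output path and the use of the public (Path, TaskAttemptContext) overload are assumptions for illustration; the (Path, JobContext) prototype shown above is marked @Private.

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.TaskAttemptID;
import org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter;
import org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl;

public class FileOutputCommitterSketch {
    public static void main(String[] args) throws IOException {
        Configuration conf = new Configuration();
        // Assumed output location; adjust for your file system.
        Path outputPath = new Path("/tmp/fileoutputcommitter-example");
        TaskAttemptContext context = new TaskAttemptContextImpl(conf, new TaskAttemptID());

        FileOutputCommitter committer = new FileOutputCommitter(outputPath, context);

        committer.setupJob(context);  // create the temporary job output area
        committer.setupTask(context); // per-task setup

        // ... write task output under committer.getWorkPath() ...

        if (committer.needsTaskCommit(context)) {
            committer.commitTask(context); // promote this task's temporary files
        }
        committer.commitJob(context); // finalize the job output
    }
}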

Usage

From source file: mlbench.bayes.train.IndexInstances.java

License: Apache License

@SuppressWarnings({ "deprecation" })
public static void main(String[] args) throws MPI_D_Exception, IOException, MPIException {
    parseArgs(args);
    HashMap<String, String> conf = new HashMap<String, String>();
    initConf(conf);
    MPI_D.Init(args, MPI_D.Mode.Common, conf);
    if (MPI_D.COMM_BIPARTITE_O != null) {
        rank = MPI_D.Comm_rank(MPI_D.COMM_BIPARTITE_O);

        if (rank == 0) {
            System.out.println(IndexInstances.class.getSimpleName() + " O start.");
            createLabelIndex(labPath);
        }

        HadoopUtil.cacheFiles(labPath, config);

        MPI_D.COMM_BIPARTITE_O.Barrier();

        OpenObjectIntHashMap<String> labelIndex = BayesUtils.readIndexFromCache(config);

        if (MPI_D.COMM_BIPARTITE_O != null) {
            // O communicator
            int rank = MPI_D.Comm_rank(MPI_D.COMM_BIPARTITE_O);
            int size = MPI_D.Comm_size(MPI_D.COMM_BIPARTITE_O);
            FileSplit[] inputs = DataMPIUtil.HDFSDataLocalLocator.getTaskInputs(MPI_D.COMM_BIPARTITE_O,
                    (JobConf) config, inDir, rank);
            for (int i = 0; i < inputs.length; i++) {
                FileSplit fsplit = inputs[i];
                SequenceFileRecordReader<Text, VectorWritable> kvrr = new SequenceFileRecordReader<>(config,
                        fsplit);
                Text labelText = kvrr.createKey();
                VectorWritable instance = kvrr.createValue();
                while (kvrr.next(labelText, instance)) {
                    String label = SLASH.split(labelText.toString())[1];
                    if (labelIndex.containsKey(label)) {
                        MPI_D.Send(new IntWritable(labelIndex.get(label)), instance);
                    }
                }
            }
        }
    } else if (MPI_D.COMM_BIPARTITE_A != null) {
        int rank = MPI_D.Comm_rank(MPI_D.COMM_BIPARTITE_A);
        config.set(MAPRED_OUTPUT_DIR, outDir);
        config.set("mapred.task.id", DataMPIUtil.getHadoopTaskAttemptID().toString().toString());
        ((JobConf) config).setOutputKeyClass(IntWritable.class);
        ((JobConf) config).setOutputValueClass(VectorWritable.class);
        TaskAttemptContext taskContext = new TaskAttemptContextImpl(config,
                DataMPIUtil.getHadoopTaskAttemptID());
        SequenceFileOutputFormat<IntWritable, VectorWritable> outfile = new SequenceFileOutputFormat<>();
        FileSystem fs = FileSystem.get(config);

        Path output = new Path(config.get(MAPRED_OUTPUT_DIR));
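        // FileOutputCommitter manages the job's temporary output layout: setupJob() below prepares it,
        // and needsTaskCommit()/commitTask() at the end of this branch finalize this task's files.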
        FileOutputCommitter fcommitter = new FileOutputCommitter(output, taskContext);
        RecordWriter<IntWritable, VectorWritable> outrw = null;
        try {
            fcommitter.setupJob(taskContext);
            outrw = outfile.getRecordWriter(fs, (JobConf) config, getOutputName(rank), null);
        } catch (IOException e) {
            e.printStackTrace();
            System.err.println("ERROR: Please set the HDFS configuration properly\n");
            System.exit(-1);
        }

        IntWritable key = null, newKey = null;
        VectorWritable point = null, newPoint = null;
        Vector vector = null;
        Object[] vals = MPI_D.Recv();
        while (vals != null) {
            newKey = (IntWritable) vals[0];
            newPoint = (VectorWritable) vals[1];
            if (key == null && point == null) {
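                // first record received; nothing accumulated yet, so nothing to flush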
            } else if (!key.equals(newKey)) {
                outrw.write(key, new VectorWritable(vector));
                vector = null;
            }
            if (vector == null) {
                vector = newPoint.get();
            } else {
                vector.assign(newPoint.get(), Functions.PLUS);
            }

            key = newKey;
            point = newPoint;
            vals = MPI_D.Recv();
        }
        if (newKey != null && newPoint != null) {
            outrw.write(key, new VectorWritable(vector));
        }

        outrw.close(null);
        if (fcommitter.needsTaskCommit(taskContext)) {
            fcommitter.commitTask(taskContext);
        }
    }

    MPI_D.Finalize();
}

From source file: mlbench.bayes.train.WeightSummer.java

License: Apache License

@SuppressWarnings("deprecation")
public static void main(String[] args) throws MPI_D_Exception, IOException, MPIException {
    parseArgs(args);
    HashMap<String, String> conf = new HashMap<String, String>();
    initConf(conf);
    MPI_D.Init(args, MPI_D.Mode.Common, conf);
    if (MPI_D.COMM_BIPARTITE_O != null) {

        int rank = MPI_D.Comm_rank(MPI_D.COMM_BIPARTITE_O);
        int size = MPI_D.Comm_size(MPI_D.COMM_BIPARTITE_O);
        FileSplit[] inputs = DataMPIUtil.HDFSDataLocalLocator.getTaskInputs(MPI_D.COMM_BIPARTITE_O,
                (JobConf) config, inDir, rank);
        Vector weightsPerFeature = null;
        Vector weightsPerLabel = new DenseVector(labNum);

        for (int i = 0; i < inputs.length; i++) {
            FileSplit fsplit = inputs[i];
            SequenceFileRecordReader<IntWritable, VectorWritable> kvrr = new SequenceFileRecordReader<>(config,
                    fsplit);
            IntWritable index = kvrr.createKey();
            VectorWritable value = kvrr.createValue();
            while (kvrr.next(index, value)) {
                Vector instance = value.get();
                if (weightsPerFeature == null) {
                    weightsPerFeature = new RandomAccessSparseVector(instance.size(),
                            instance.getNumNondefaultElements());
                }

                int label = index.get();
                weightsPerFeature.assign(instance, Functions.PLUS);
                weightsPerLabel.set(label, weightsPerLabel.get(label) + instance.zSum());
            }
        }
        if (weightsPerFeature != null) {
            MPI_D.Send(new Text(WEIGHTS_PER_FEATURE), new VectorWritable(weightsPerFeature));
            MPI_D.Send(new Text(WEIGHTS_PER_LABEL), new VectorWritable(weightsPerLabel));
        }
    } else if (MPI_D.COMM_BIPARTITE_A != null) {
        int rank = MPI_D.Comm_rank(MPI_D.COMM_BIPARTITE_A);
        config.set(MAPRED_OUTPUT_DIR, outDirW);
        config.set("mapred.task.id", DataMPIUtil.getHadoopTaskAttemptID().toString().toString());
        ((JobConf) config).setOutputKeyClass(Text.class);
        ((JobConf) config).setOutputValueClass(VectorWritable.class);
        TaskAttemptContext taskContext = new TaskAttemptContextImpl(config,
                DataMPIUtil.getHadoopTaskAttemptID());
        SequenceFileOutputFormat<Text, VectorWritable> outfile = new SequenceFileOutputFormat<>();
        FileSystem fs = FileSystem.get(config);

        Path output = new Path(config.get(MAPRED_OUTPUT_DIR));
        FileOutputCommitter fcommitter = new FileOutputCommitter(output, taskContext);
        RecordWriter<Text, VectorWritable> outrw = null;
        try {
            fcommitter.setupJob(taskContext);
            outrw = outfile.getRecordWriter(fs, (JobConf) config, getOutputName(rank), null);
        } catch (IOException e) {
            e.printStackTrace();
            System.err.println("ERROR: Please set the HDFS configuration properly\n");
            System.exit(-1);
        }

        Text key = null, newKey = null;
        VectorWritable point = null, newPoint = null;
        Vector vector = null;
        Object[] vals = MPI_D.Recv();
        while (vals != null) {
            newKey = (Text) vals[0];
            newPoint = (VectorWritable) vals[1];
            if (key == null && point == null) {
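                // first record received; nothing accumulated yet, so nothing to flush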
            } else if (!key.equals(newKey)) {
                outrw.write(key, new VectorWritable(vector));
                vector = null;
            }
            if (vector == null) {
                vector = newPoint.get();
            } else {
                vector.assign(newPoint.get(), Functions.PLUS);
            }

            key = newKey;
            point = newPoint;
            vals = MPI_D.Recv();
        }
        if (newKey != null && newPoint != null) {
            outrw.write(key, new VectorWritable(vector));
        }

        outrw.close(null);
        if (fcommitter.needsTaskCommit(taskContext)) {
            fcommitter.commitTask(taskContext);
        }

        MPI_D.COMM_BIPARTITE_A.Barrier();
        if (rank == 0) {
            Path resOut = new Path(outDir);
            NaiveBayesModel naiveBayesModel = BayesUtils.readModelFromDir(new Path(outDir), config);
            naiveBayesModel.serialize(resOut, config);
        }
    }

    MPI_D.Finalize();
}

From source file: org.apache.accumulo.examples.wikisearch.ingest.WikipediaMapperTest.java

License: Apache License

@Before
public void setup() throws Exception {

    conf.set(AggregatingRecordReader.START_TOKEN, "<page>");
    conf.set(AggregatingRecordReader.END_TOKEN, "</page>");
    conf.set(WikipediaConfiguration.TABLE_NAME, TABLE_NAME);
    conf.set(WikipediaConfiguration.NUM_PARTITIONS, "1");
    conf.set(WikipediaConfiguration.NUM_GROUPS, "1");

    MockInstance i = new MockInstance();
    c = i.getConnector("root", "pass");
    c.tableOperations().delete(METADATA_TABLE_NAME);
    c.tableOperations().delete(TABLE_NAME);
    c.tableOperations().delete(INDEX_TABLE_NAME);
    c.tableOperations().delete(RINDEX_TABLE_NAME);
    c.tableOperations().create(METADATA_TABLE_NAME);
    c.tableOperations().create(TABLE_NAME);
    c.tableOperations().create(INDEX_TABLE_NAME);
    c.tableOperations().create(RINDEX_TABLE_NAME);

    writerMap.put(new Text(METADATA_TABLE_NAME), c.createBatchWriter(METADATA_TABLE_NAME, 1000L, 1000L, 1));
    writerMap.put(new Text(TABLE_NAME), c.createBatchWriter(TABLE_NAME, 1000L, 1000L, 1));
    writerMap.put(new Text(INDEX_TABLE_NAME), c.createBatchWriter(INDEX_TABLE_NAME, 1000L, 1000L, 1));
    writerMap.put(new Text(RINDEX_TABLE_NAME), c.createBatchWriter(RINDEX_TABLE_NAME, 1000L, 1000L, 1));

    TaskAttemptID id = new TaskAttemptID();
    TaskAttemptContext context = new TaskAttemptContext(conf, id);

    RawLocalFileSystem fs = new RawLocalFileSystem();
    fs.setConf(conf);

    URL url = ClassLoader.getSystemResource("enwiki-20110901-001.xml");
    Assert.assertNotNull(url);
    File data = new File(url.toURI());
    Path tmpFile = new Path(data.getAbsolutePath());

    // Setup the Mapper
    InputSplit split = new FileSplit(tmpFile, 0, fs.pathToFile(tmpFile).length(), null);
    AggregatingRecordReader rr = new AggregatingRecordReader();
    Path ocPath = new Path(tmpFile, "oc");
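    // The mapper context only needs some OutputCommitter; a FileOutputCommitter rooted in a scratch
    // "oc" directory (removed via deleteOnExit) is sufficient for this test.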
    OutputCommitter oc = new FileOutputCommitter(ocPath, context);
    fs.deleteOnExit(ocPath);
    StandaloneStatusReporter sr = new StandaloneStatusReporter();
    rr.initialize(split, context);
    MockAccumuloRecordWriter rw = new MockAccumuloRecordWriter();
    WikipediaMapper mapper = new WikipediaMapper();

    // Load data into Mock Accumulo
    Mapper<LongWritable, Text, Text, Mutation>.Context con = mapper.new Context(conf, id, rr, rw, oc, sr,
            split);
    mapper.run(con);

    // Flush and close record writers.
    rw.close(context);

}

From source file: org.apache.accumulo.examples.wikisearch.logic.TestQueryLogic.java

License: Apache License

@Before
public void setup() throws Exception {

    Logger.getLogger(AbstractQueryLogic.class).setLevel(Level.DEBUG);
    Logger.getLogger(QueryLogic.class).setLevel(Level.DEBUG);
    Logger.getLogger(RangeCalculator.class).setLevel(Level.DEBUG);

    conf.set(AggregatingRecordReader.START_TOKEN, "<page>");
    conf.set(AggregatingRecordReader.END_TOKEN, "</page>");
    conf.set(WikipediaConfiguration.TABLE_NAME, TABLE_NAME);
    conf.set(WikipediaConfiguration.NUM_PARTITIONS, "1");
    conf.set(WikipediaConfiguration.NUM_GROUPS, "1");

    MockInstance i = new MockInstance();
    c = i.getConnector("root", new PasswordToken(""));
    WikipediaIngester.createTables(c.tableOperations(), TABLE_NAME, false);
    for (String table : TABLE_NAMES) {
        writerMap.put(new Text(table), c.createBatchWriter(table, 1000L, 1000L, 1));
    }

    TaskAttemptID id = new TaskAttemptID("fake", 1, TaskType.MAP, 1, 1);
    TaskAttemptContext context = new TaskAttemptContextImpl(conf, id);

    RawLocalFileSystem fs = new RawLocalFileSystem();
    fs.setConf(conf);

    URL url = ClassLoader.getSystemResource("enwiki-20110901-001.xml");
    Assert.assertNotNull(url);
    File data = new File(url.toURI());
    Path tmpFile = new Path(data.getAbsolutePath());

    // Setup the Mapper
    WikipediaInputSplit split = new WikipediaInputSplit(
            new FileSplit(tmpFile, 0, fs.pathToFile(tmpFile).length(), null), 0);
    AggregatingRecordReader rr = new AggregatingRecordReader();
    Path ocPath = new Path(tmpFile, "oc");
    OutputCommitter oc = new FileOutputCommitter(ocPath, context);
    fs.deleteOnExit(ocPath);
    StandaloneStatusReporter sr = new StandaloneStatusReporter();
    rr.initialize(split, context);
    MockAccumuloRecordWriter rw = new MockAccumuloRecordWriter();
    WikipediaMapper mapper = new WikipediaMapper();

    // there are times I wonder, "Why do Java people think this is good?" then I drink more whiskey
    final MapContextImpl<LongWritable, Text, Text, Mutation> mapContext = new MapContextImpl<LongWritable, Text, Text, Mutation>(
            conf, id, rr, rw, oc, sr, split);
    // Load data into Mock Accumulo
    Mapper<LongWritable, Text, Text, Mutation>.Context con = mapper.new Context() {
        /**
         * Get the input split for this map.
         */
        public InputSplit getInputSplit() {
            return mapContext.getInputSplit();
        }

        @Override
        public LongWritable getCurrentKey() throws IOException, InterruptedException {
            return mapContext.getCurrentKey();
        }

        @Override
        public Text getCurrentValue() throws IOException, InterruptedException {
            return mapContext.getCurrentValue();
        }

        @Override
        public boolean nextKeyValue() throws IOException, InterruptedException {
            return mapContext.nextKeyValue();
        }

        @Override
        public Counter getCounter(Enum<?> counterName) {
            return mapContext.getCounter(counterName);
        }

        @Override
        public Counter getCounter(String groupName, String counterName) {
            return mapContext.getCounter(groupName, counterName);
        }

        @Override
        public OutputCommitter getOutputCommitter() {
            return mapContext.getOutputCommitter();
        }

        @Override
        public void write(Text key, Mutation value) throws IOException, InterruptedException {
            mapContext.write(key, value);
        }

        @Override
        public String getStatus() {
            return mapContext.getStatus();
        }

        @Override
        public TaskAttemptID getTaskAttemptID() {
            return mapContext.getTaskAttemptID();
        }

        @Override
        public void setStatus(String msg) {
            mapContext.setStatus(msg);
        }

        @Override
        public Path[] getArchiveClassPaths() {
            return mapContext.getArchiveClassPaths();
        }

        @Override
        public String[] getArchiveTimestamps() {
            return mapContext.getArchiveTimestamps();
        }

        @Override
        public URI[] getCacheArchives() throws IOException {
            return mapContext.getCacheArchives();
        }

        @Override
        public URI[] getCacheFiles() throws IOException {
            return mapContext.getCacheFiles();
        }

        @Override
        public Class<? extends Reducer<?, ?, ?, ?>> getCombinerClass() throws ClassNotFoundException {
            return mapContext.getCombinerClass();
        }

        @Override
        public Configuration getConfiguration() {
            return mapContext.getConfiguration();
        }

        @Override
        public Path[] getFileClassPaths() {
            return mapContext.getFileClassPaths();
        }

        @Override
        public String[] getFileTimestamps() {
            return mapContext.getFileTimestamps();
        }

        @Override
        public RawComparator<?> getGroupingComparator() {
            return mapContext.getGroupingComparator();
        }

        @Override
        public Class<? extends InputFormat<?, ?>> getInputFormatClass() throws ClassNotFoundException {
            return mapContext.getInputFormatClass();
        }

        @Override
        public String getJar() {
            return mapContext.getJar();
        }

        @Override
        public JobID getJobID() {
            return mapContext.getJobID();
        }

        @Override
        public String getJobName() {
            return mapContext.getJobName();
        }

        /*@Override
        public boolean userClassesTakesPrecedence() {
          return mapContext.userClassesTakesPrecedence();
        }*/

        @Override
        public boolean getJobSetupCleanupNeeded() {
            return mapContext.getJobSetupCleanupNeeded();
        }

        @Override
        public boolean getTaskCleanupNeeded() {
            return mapContext.getTaskCleanupNeeded();
        }

        @Override
        public Path[] getLocalCacheArchives() throws IOException {
            return mapContext.getLocalCacheArchives();
        }

        @Override
        public Path[] getLocalCacheFiles() throws IOException {
            return mapContext.getLocalCacheFiles();
        }

        @Override
        public Class<?> getMapOutputKeyClass() {
            return mapContext.getMapOutputKeyClass();
        }

        @Override
        public Class<?> getMapOutputValueClass() {
            return mapContext.getMapOutputValueClass();
        }

        @Override
        public Class<? extends Mapper<?, ?, ?, ?>> getMapperClass() throws ClassNotFoundException {
            return mapContext.getMapperClass();
        }

        @Override
        public int getMaxMapAttempts() {
            return mapContext.getMaxMapAttempts();
        }

        @Override
        public int getMaxReduceAttempts() {
            return mapContext.getMaxReduceAttempts();
        }

        @Override
        public int getNumReduceTasks() {
            return mapContext.getNumReduceTasks();
        }

        @Override
        public Class<? extends OutputFormat<?, ?>> getOutputFormatClass() throws ClassNotFoundException {
            return mapContext.getOutputFormatClass();
        }

        @Override
        public Class<?> getOutputKeyClass() {
            return mapContext.getOutputKeyClass();
        }

        @Override
        public Class<?> getOutputValueClass() {
            return mapContext.getOutputValueClass();
        }

        @Override
        public Class<? extends Partitioner<?, ?>> getPartitionerClass() throws ClassNotFoundException {
            return mapContext.getPartitionerClass();
        }

        @Override
        public Class<? extends Reducer<?, ?, ?, ?>> getReducerClass() throws ClassNotFoundException {
            return mapContext.getReducerClass();
        }

        @Override
        public RawComparator<?> getSortComparator() {
            return mapContext.getSortComparator();
        }

        @Override
        public boolean getSymlink() {
            return mapContext.getSymlink();
        }

        @Override
        public Path getWorkingDirectory() throws IOException {
            return mapContext.getWorkingDirectory();
        }

        @Override
        public void progress() {
            mapContext.progress();
        }

        @Override
        public boolean getProfileEnabled() {
            return mapContext.getProfileEnabled();
        }

        @Override
        public String getProfileParams() {
            return mapContext.getProfileParams();
        }

        @Override
        public IntegerRanges getProfileTaskRange(boolean isMap) {
            return mapContext.getProfileTaskRange(isMap);
        }

        @Override
        public String getUser() {
            return mapContext.getUser();
        }

        @Override
        public Credentials getCredentials() {
            return mapContext.getCredentials();
        }

        @Override
        public float getProgress() {
            return mapContext.getProgress();
        }
    };

    mapper.run(con);

    // Flush and close record writers.
    rw.close(context);

    table = new QueryLogic();
    table.setMetadataTableName(METADATA_TABLE_NAME);
    table.setTableName(TABLE_NAME);
    table.setIndexTableName(INDEX_TABLE_NAME);
    table.setReverseIndexTableName(RINDEX_TABLE_NAME);
    table.setUseReadAheadIterator(false);
    table.setUnevaluatedFields(Collections.singletonList("TEXT"));
}

From source file: org.apache.flink.hadoopcompatibility.mapreduce.HadoopOutputFormat.java

License: Apache License

/**
 * Create the temporary output file for the Hadoop RecordWriter.
 * @param taskNumber The number of the parallel instance.
 * @param numTasks The number of parallel tasks.
 * @throws IOException
 */
@Override
public void open(int taskNumber, int numTasks) throws IOException {
    if (Integer.toString(taskNumber + 1).length() > 6) {
        throw new IOException("Task id too large.");
    }

    // for hadoop 2.2
    this.configuration.set("mapreduce.output.basename", "tmp");

    TaskAttemptID taskAttemptID = TaskAttemptID.forName("attempt__0000_r_"
            + String.format("%" + (6 - Integer.toString(taskNumber + 1).length()) + "s", " ").replace(" ", "0")
            + Integer.toString(taskNumber + 1) + "_0");

    this.configuration.set("mapred.task.id", taskAttemptID.toString());
    this.configuration.setInt("mapred.task.partition", taskNumber + 1);
    // for hadoop 2.2
    this.configuration.set("mapreduce.task.attempt.id", taskAttemptID.toString());
    this.configuration.setInt("mapreduce.task.partition", taskNumber + 1);

    try {
        this.context = HadoopUtils.instantiateTaskAttemptContext(this.configuration, taskAttemptID);
    } catch (Exception e) {
        throw new RuntimeException(e);
    }

    this.fileOutputCommitter = new FileOutputCommitter(new Path(this.configuration.get("mapred.output.dir")),
            context);

    try {
        this.fileOutputCommitter.setupJob(HadoopUtils.instantiateJobContext(this.configuration, new JobID()));
    } catch (Exception e) {
        throw new RuntimeException(e);
    }

    // compatible for hadoop 2.2.0, the temporary output directory is different from hadoop 1.2.1
    this.configuration.set("mapreduce.task.output.dir", this.fileOutputCommitter.getWorkPath().toString());

    try {
        this.recordWriter = this.mapreduceOutputFormat.getRecordWriter(this.context);
    } catch (InterruptedException e) {
        throw new IOException("Could not create RecordWriter.", e);
    }
}

From source file: org.apache.hcatalog.mapreduce.FileOutputFormatContainer.java

License: Apache License

static void setWorkOutputPath(TaskAttemptContext context) throws IOException {
    String outputPath = context.getConfiguration().get("mapred.output.dir");
    //we need to do this to get the task path and set it for mapred implementation
    //since it can't be done automatically because of mapreduce->mapred abstraction
    if (outputPath != null)
        context.getConfiguration().set("mapred.work.output.dir",
                new FileOutputCommitter(new Path(outputPath), context).getWorkPath().toString());
}

From source file: org.apache.hcatalog.mapreduce.FileRecordWriterContainer.java

License: Apache License

@Override
public void write(WritableComparable<?> key, HCatRecord value) throws IOException, InterruptedException {

    org.apache.hadoop.mapred.RecordWriter localWriter;
    ObjectInspector localObjectInspector;
    SerDe localSerDe;
    OutputJobInfo localJobInfo = null;

    if (dynamicPartitioningUsed) {
        // calculate which writer to use from the remaining values - this needs to be done before we delete cols
        List<String> dynamicPartValues = new ArrayList<String>();
        for (Integer colToAppend : dynamicPartCols) {
            dynamicPartValues.add(value.get(colToAppend).toString());
        }

        String dynKey = dynamicPartValues.toString();
        if (!baseDynamicWriters.containsKey(dynKey)) {
            if ((maxDynamicPartitions != -1) && (baseDynamicWriters.size() > maxDynamicPartitions)) {
                throw new HCatException(ErrorType.ERROR_TOO_MANY_DYNAMIC_PTNS,
                        "Number of dynamic partitions being created "
                                + "exceeds configured max allowable partitions[" + maxDynamicPartitions
                                + "], increase parameter [" + HiveConf.ConfVars.DYNAMICPARTITIONMAXPARTS.varname
                                + "] if needed.");
            }

            org.apache.hadoop.mapred.TaskAttemptContext currTaskContext = HCatMapRedUtil
                    .createTaskAttemptContext(context);
            configureDynamicStorageHandler(currTaskContext, dynamicPartValues);
            localJobInfo = HCatBaseOutputFormat.getJobInfo(currTaskContext);

            //setup serDe
            SerDe currSerDe = ReflectionUtils.newInstance(storageHandler.getSerDeClass(),
                    currTaskContext.getJobConf());
            try {
                InternalUtil.initializeOutputSerDe(currSerDe, currTaskContext.getConfiguration(), localJobInfo);
            } catch (SerDeException e) {
                throw new IOException("Failed to initialize SerDe", e);
            }

            //create base OutputFormat
            org.apache.hadoop.mapred.OutputFormat baseOF = ReflectionUtils
                    .newInstance(storageHandler.getOutputFormatClass(), currTaskContext.getJobConf());

            //We are skipping calling checkOutputSpecs() for each partition
            //As it can throw a FileAlreadyExistsException when more than one mapper is writing to a partition
            //See HCATALOG-490, also to avoid contacting the namenode for each new FileOutputFormat instance
            //In general this should be ok for most FileOutputFormat implementations
            //but may become an issue for cases when the method is used to perform other setup tasks

            //get Output Committer
            org.apache.hadoop.mapred.OutputCommitter baseOutputCommitter = currTaskContext.getJobConf()
                    .getOutputCommitter();
            //create currJobContext the latest so it gets all the config changes
            org.apache.hadoop.mapred.JobContext currJobContext = HCatMapRedUtil
                    .createJobContext(currTaskContext);
            //setupJob()
            baseOutputCommitter.setupJob(currJobContext);
            //recreate to refresh jobConf of currTask context
            currTaskContext = HCatMapRedUtil.createTaskAttemptContext(currJobContext.getJobConf(),
                    currTaskContext.getTaskAttemptID(), currTaskContext.getProgressible());
            //set temp location
            currTaskContext.getConfiguration().set("mapred.work.output.dir",
                    new FileOutputCommitter(new Path(localJobInfo.getLocation()), currTaskContext).getWorkPath()
                            .toString());
            //setupTask()
            baseOutputCommitter.setupTask(currTaskContext);

            Path parentDir = new Path(currTaskContext.getConfiguration().get("mapred.work.output.dir"));
            Path childPath = new Path(parentDir, FileOutputFormat.getUniqueFile(currTaskContext, "part", ""));

            org.apache.hadoop.mapred.RecordWriter baseRecordWriter = baseOF.getRecordWriter(
                    parentDir.getFileSystem(currTaskContext.getConfiguration()), currTaskContext.getJobConf(),
                    childPath.toString(), InternalUtil.createReporter(currTaskContext));

            baseDynamicWriters.put(dynKey, baseRecordWriter);
            baseDynamicSerDe.put(dynKey, currSerDe);
            baseDynamicCommitters.put(dynKey, baseOutputCommitter);
            dynamicContexts.put(dynKey, currTaskContext);
            dynamicObjectInspectors.put(dynKey,
                    InternalUtil.createStructObjectInspector(jobInfo.getOutputSchema()));
            dynamicOutputJobInfo.put(dynKey, HCatOutputFormat.getJobInfo(dynamicContexts.get(dynKey)));
        }

        localJobInfo = dynamicOutputJobInfo.get(dynKey);
        localWriter = baseDynamicWriters.get(dynKey);
        localSerDe = baseDynamicSerDe.get(dynKey);
        localObjectInspector = dynamicObjectInspectors.get(dynKey);
    } else {
        localJobInfo = jobInfo;
        localWriter = getBaseRecordWriter();
        localSerDe = serDe;
        localObjectInspector = objectInspector;
    }

    for (Integer colToDel : partColsToDel) {
        value.remove(colToDel);
    }

    //The key given by user is ignored
    try {
        localWriter.write(NullWritable.get(), localSerDe.serialize(value.getAll(), localObjectInspector));
    } catch (SerDeException e) {
        throw new IOException("Failed to serialize object", e);
    }
}

From source file: org.apache.hive.hcatalog.mapreduce.DynamicPartitionFileRecordWriterContainer.java

License: Apache License

@Override
protected LocalFileWriter getLocalFileWriter(HCatRecord value) throws IOException, HCatException {
    OutputJobInfo localJobInfo = null;
    // Calculate which writer to use from the remaining values - this needs to
    // be done before we delete cols.
    List<String> dynamicPartValues = new ArrayList<String>();
    for (Integer colToAppend : dynamicPartCols) {
        Object partitionValue = value.get(colToAppend);
        dynamicPartValues
                .add(partitionValue == null ? HIVE_DEFAULT_PARTITION_VALUE : partitionValue.toString());
    }

    String dynKey = dynamicPartValues.toString();
    if (!baseDynamicWriters.containsKey(dynKey)) {
        if ((maxDynamicPartitions != -1) && (baseDynamicWriters.size() > maxDynamicPartitions)) {
            throw new HCatException(ErrorType.ERROR_TOO_MANY_DYNAMIC_PTNS,
                    "Number of dynamic partitions being created "
                            + "exceeds configured max allowable partitions[" + maxDynamicPartitions
                            + "], increase parameter [" + HiveConf.ConfVars.DYNAMICPARTITIONMAXPARTS.varname
                            + "] if needed.");
        }

        org.apache.hadoop.mapred.TaskAttemptContext currTaskContext = HCatMapRedUtil
                .createTaskAttemptContext(context);
        configureDynamicStorageHandler(currTaskContext, dynamicPartValues);
        localJobInfo = HCatBaseOutputFormat.getJobInfo(currTaskContext.getConfiguration());

        // Setup serDe.
        SerDe currSerDe = ReflectionUtils.newInstance(storageHandler.getSerDeClass(),
                currTaskContext.getJobConf());
        try {
            InternalUtil.initializeOutputSerDe(currSerDe, currTaskContext.getConfiguration(), localJobInfo);
        } catch (SerDeException e) {
            throw new IOException("Failed to initialize SerDe", e);
        }

        // create base OutputFormat
        org.apache.hadoop.mapred.OutputFormat baseOF = ReflectionUtils
                .newInstance(storageHandler.getOutputFormatClass(), currTaskContext.getJobConf());

        // We are skipping calling checkOutputSpecs() for each partition
        // As it can throw a FileAlreadyExistsException when more than one
        // mapper is writing to a partition.
        // See HCATALOG-490, also to avoid contacting the namenode for each new
        // FileOutputFormat instance.
        // In general this should be ok for most FileOutputFormat implementations
        // but may become an issue for cases when the method is used to perform
        // other setup tasks.

        // Get Output Committer
        org.apache.hadoop.mapred.OutputCommitter baseOutputCommitter = currTaskContext.getJobConf()
                .getOutputCommitter();

        // Create currJobContext the latest so it gets all the config changes
        org.apache.hadoop.mapred.JobContext currJobContext = HCatMapRedUtil.createJobContext(currTaskContext);

        // Set up job.
        baseOutputCommitter.setupJob(currJobContext);

        // Recreate to refresh jobConf of currTask context.
        currTaskContext = HCatMapRedUtil.createTaskAttemptContext(currJobContext.getJobConf(),
                currTaskContext.getTaskAttemptID(), currTaskContext.getProgressible());

        // Set temp location.
        currTaskContext.getConfiguration().set("mapred.work.output.dir",
                new FileOutputCommitter(new Path(localJobInfo.getLocation()), currTaskContext).getWorkPath()
                        .toString());

        // Set up task.
        baseOutputCommitter.setupTask(currTaskContext);

        Path parentDir = new Path(currTaskContext.getConfiguration().get("mapred.work.output.dir"));
        Path childPath = new Path(parentDir, FileOutputFormat.getUniqueFile(currTaskContext,
                currTaskContext.getConfiguration().get("mapreduce.output.basename", "part"), ""));

        RecordWriter baseRecordWriter = baseOF.getRecordWriter(
                parentDir.getFileSystem(currTaskContext.getConfiguration()), currTaskContext.getJobConf(),
                childPath.toString(), InternalUtil.createReporter(currTaskContext));

        baseDynamicWriters.put(dynKey, baseRecordWriter);
        baseDynamicSerDe.put(dynKey, currSerDe);
        baseDynamicCommitters.put(dynKey, baseOutputCommitter);
        dynamicContexts.put(dynKey, currTaskContext);
        dynamicObjectInspectors.put(dynKey,
                InternalUtil.createStructObjectInspector(jobInfo.getOutputSchema()));
        dynamicOutputJobInfo.put(dynKey,
                HCatOutputFormat.getJobInfo(dynamicContexts.get(dynKey).getConfiguration()));
    }

    return new LocalFileWriter(baseDynamicWriters.get(dynKey), dynamicObjectInspectors.get(dynKey),
            baseDynamicSerDe.get(dynKey), dynamicOutputJobInfo.get(dynKey));
}

From source file: org.apache.phoenix.mapreduce.MultiHfileOutputFormat.java

License: Apache License

/**
 * Creates a RecordWriter that stages HFiles, one writer per table and column family, under the
 * committer's work path.
 * @param context the task attempt context
 * @return the record writer
 * @throws IOException
 */
static <V extends Cell> RecordWriter<TableRowkeyPair, V> createRecordWriter(final TaskAttemptContext context)
        throws IOException {
    // Get the path of the temporary output file
    final Path outputPath = FileOutputFormat.getOutputPath(context);
    final Path outputdir = new FileOutputCommitter(outputPath, context).getWorkPath();
    final Configuration conf = context.getConfiguration();
    final FileSystem fs = outputdir.getFileSystem(conf);

    final long maxsize = conf.getLong(HConstants.HREGION_MAX_FILESIZE, HConstants.DEFAULT_MAX_FILE_SIZE);
    // Invented config.  Add to hbase-*.xml if other than default compression.
    final String defaultCompressionStr = conf.get("hfile.compression", Compression.Algorithm.NONE.getName());
    final Algorithm defaultCompression = AbstractHFileWriter.compressionByName(defaultCompressionStr);
    final boolean compactionExclude = conf.getBoolean("hbase.mapreduce.hfileoutputformat.compaction.exclude",
            false);

    return new RecordWriter<TableRowkeyPair, V>() {
        // Map of families to writers and how much has been output on the writer.
        private final Map<byte[], WriterLength> writers = new TreeMap<byte[], WriterLength>(
                Bytes.BYTES_COMPARATOR);
        private byte[] previousRow = HConstants.EMPTY_BYTE_ARRAY;
        private final byte[] now = Bytes.toBytes(EnvironmentEdgeManager.currentTimeMillis());
        private boolean rollRequested = false;

        @Override
        public void write(TableRowkeyPair row, V cell) throws IOException {
            KeyValue kv = KeyValueUtil.ensureKeyValue(cell);
            // null input == user explicitly wants to flush
            if (row == null && kv == null) {
                rollWriters();
                return;
            }

            // phoenix-2216: start : extract table name from the rowkey
            String tableName = row.getTableName();
            byte[] rowKey = row.getRowkey().get();
            long length = kv.getLength();
            byte[] family = CellUtil.cloneFamily(kv);
            byte[] tableAndFamily = join(tableName, Bytes.toString(family));
            WriterLength wl = this.writers.get(tableAndFamily);
            // phoenix-2216: end

            // If this is a new column family, verify that the directory exists
            if (wl == null) {
                // phoenix-2216: start : create a directory for table and family within the output dir 
                Path tableOutputPath = CsvBulkImportUtil.getOutputPath(outputdir, tableName);
                fs.mkdirs(new Path(tableOutputPath, Bytes.toString(family)));
                // phoenix-2216: end
            }

            // If any of the HFiles for the column families has reached
            // maxsize, we need to roll all the writers
            if (wl != null && wl.written + length >= maxsize) {
                this.rollRequested = true;
            }

            // This can only happen once a row is finished though
            if (rollRequested && Bytes.compareTo(this.previousRow, rowKey) != 0) {
                rollWriters();
            }

            // create a new WAL writer, if necessary
            if (wl == null || wl.writer == null) {
                // phoenix-2216: start : passed even the table name
                wl = getNewWriter(tableName, family, conf);
                // phoenix-2216: end
            }

            // we now have the proper WAL writer. full steam ahead
            kv.updateLatestStamp(this.now);
            wl.writer.append(kv);
            wl.written += length;

            // Copy the row so we know when a row transition.
            this.previousRow = rowKey;
        }

        private void rollWriters() throws IOException {
            for (WriterLength wl : this.writers.values()) {
                if (wl.writer != null) {
                    LOG.info("Writer=" + wl.writer.getPath()
                            + ((wl.written == 0) ? "" : ", wrote=" + wl.written));
                    close(wl.writer);
                }
                wl.writer = null;
                wl.written = 0;
            }
            this.rollRequested = false;
        }

        /* Create a new StoreFile.Writer.
         * @param family
         * @return A WriterLength, containing a new StoreFile.Writer.
         * @throws IOException
         */
        @edu.umd.cs.findbugs.annotations.SuppressWarnings(value = "BX_UNBOXING_IMMEDIATELY_REBOXED", justification = "Not important")
        private WriterLength getNewWriter(final String tableName, byte[] family, Configuration conf)
                throws IOException {

            WriterLength wl = new WriterLength();
            Path tableOutputPath = CsvBulkImportUtil.getOutputPath(outputdir, tableName);
            Path familydir = new Path(tableOutputPath, Bytes.toString(family));

            // phoenix-2216: start : fetching the configuration properties that were set to the table.
            // create a map from column family to the compression algorithm for the table.
            final Map<byte[], Algorithm> compressionMap = createFamilyCompressionMap(conf, tableName);
            final Map<byte[], BloomType> bloomTypeMap = createFamilyBloomTypeMap(conf, tableName);
            final Map<byte[], Integer> blockSizeMap = createFamilyBlockSizeMap(conf, tableName);
            // phoenix-2216: end

            String dataBlockEncodingStr = conf.get(DATABLOCK_ENCODING_OVERRIDE_CONF_KEY);
            final Map<byte[], DataBlockEncoding> datablockEncodingMap = createFamilyDataBlockEncodingMap(conf,
                    tableName);
            final DataBlockEncoding overriddenEncoding;
            if (dataBlockEncodingStr != null) {
                overriddenEncoding = DataBlockEncoding.valueOf(dataBlockEncodingStr);
            } else {
                overriddenEncoding = null;
            }

            Algorithm compression = compressionMap.get(family);
            compression = compression == null ? defaultCompression : compression;
            BloomType bloomType = bloomTypeMap.get(family);
            bloomType = bloomType == null ? BloomType.NONE : bloomType;
            Integer blockSize = blockSizeMap.get(family);
            blockSize = blockSize == null ? HConstants.DEFAULT_BLOCKSIZE : blockSize;
            DataBlockEncoding encoding = overriddenEncoding;
            encoding = encoding == null ? datablockEncodingMap.get(family) : encoding;
            encoding = encoding == null ? DataBlockEncoding.NONE : encoding;
            Configuration tempConf = new Configuration(conf);
            tempConf.setFloat(HConstants.HFILE_BLOCK_CACHE_SIZE_KEY, 0.0f);
            HFileContextBuilder contextBuilder = new HFileContextBuilder().withCompression(compression)
                    .withChecksumType(HStore.getChecksumType(conf))
                    .withBytesPerCheckSum(HStore.getBytesPerChecksum(conf)).withBlockSize(blockSize);
            contextBuilder.withDataBlockEncoding(encoding);
            HFileContext hFileContext = contextBuilder.build();

            wl.writer = new StoreFile.WriterBuilder(conf, new CacheConfig(tempConf), fs)
                    .withOutputDir(familydir).withBloomType(bloomType).withComparator(KeyValue.COMPARATOR)
                    .withFileContext(hFileContext).build();

            // join and put it in the writers map .
            // phoenix-2216: start : holds a map of writers where the 
            //                       key in the map is a join byte array of table name and family.
            byte[] tableAndFamily = join(tableName, Bytes.toString(family));
            this.writers.put(tableAndFamily, wl);
            // phoenix-2216: end
            return wl;
        }

        private void close(final StoreFile.Writer w) throws IOException {
            if (w != null) {
                w.appendFileInfo(StoreFile.BULKLOAD_TIME_KEY,
                        Bytes.toBytes(EnvironmentEdgeManager.currentTimeMillis()));
                w.appendFileInfo(StoreFile.BULKLOAD_TASK_KEY,
                        Bytes.toBytes(context.getTaskAttemptID().toString()));
                w.appendFileInfo(StoreFile.MAJOR_COMPACTION_KEY, Bytes.toBytes(true));
                w.appendFileInfo(StoreFile.EXCLUDE_FROM_MINOR_COMPACTION_KEY, Bytes.toBytes(compactionExclude));
                w.appendTrackedTimestampsToMetadata();
                w.close();
            }
        }

        @Override
        public void close(TaskAttemptContext c) throws IOException, InterruptedException {
            for (WriterLength wl : this.writers.values()) {
                close(wl.writer);
            }
        }
    };
}

From source file: org.apache.tajo.storage.hbase.HBaseStorageManager.java

License: Apache License

@Override
public Path commitOutputData(OverridableConf queryContext, ExecutionBlockId finalEbId, LogicalPlan plan,
        Schema schema, TableDesc tableDesc) throws IOException {
    if (tableDesc == null) {
        throw new IOException("TableDesc is null while calling loadIncrementalHFiles: " + finalEbId);
    }
    Path stagingDir = new Path(queryContext.get(QueryVars.STAGING_DIR));
    Path stagingResultDir = new Path(stagingDir, TajoConstants.RESULT_DIR_NAME);

    Configuration hbaseConf = HBaseStorageManager.getHBaseConfiguration(queryContext.getConf(),
            tableDesc.getMeta());
    hbaseConf.set("hbase.loadincremental.threads.max", "2");

    JobContextImpl jobContext = new JobContextImpl(hbaseConf,
            new JobID(finalEbId.getQueryId().toString(), finalEbId.getId()));

    FileOutputCommitter committer = new FileOutputCommitter(stagingResultDir, jobContext);
    Path jobAttemptPath = committer.getJobAttemptPath(jobContext);
    FileSystem fs = jobAttemptPath.getFileSystem(queryContext.getConf());
    if (!fs.exists(jobAttemptPath) || fs.listStatus(jobAttemptPath) == null) {
        LOG.warn("No query attempt file in " + jobAttemptPath);
        return stagingResultDir;
    }
    committer.commitJob(jobContext);

    if (tableDesc.getName() == null && tableDesc.getPath() != null) {

        // insert into location
        return super.commitOutputData(queryContext, finalEbId, plan, schema, tableDesc, false);
    } else {
        // insert into table
        String tableName = tableDesc.getMeta().getOption(HBaseStorageConstants.META_TABLE_KEY);

        HTable htable = new HTable(hbaseConf, tableName);
        try {
            LoadIncrementalHFiles loadIncrementalHFiles = null;
            try {
                loadIncrementalHFiles = new LoadIncrementalHFiles(hbaseConf);
            } catch (Exception e) {
                LOG.error(e.getMessage(), e);
                throw new IOException(e.getMessage(), e);
            }
            loadIncrementalHFiles.doBulkLoad(stagingResultDir, htable);

            return stagingResultDir;
        } finally {
            htable.close();
        }
    }
}