List of usage examples for the org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter constructor
@Private public FileOutputCommitter(Path outputPath, JobContext context) throws IOException
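The examples below all construct the committer directly and drive parts of its lifecycle by hand. As a point of reference, here is a minimal, self-contained sketch of that lifecycle (set up the job, write under getWorkPath(), commit the task, commit the job). It uses the companion public constructor that takes a TaskAttemptContext rather than the @Private JobContext constructor shown above; the output directory, task attempt id string, and file name are placeholder values chosen for illustration, not taken from any of the source files on this page.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.TaskAttemptID;
import org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter;
import org.apache.hadoop.mapreduce.task.JobContextImpl;
import org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl;

public class FileOutputCommitterSketch {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Path output = new Path("/tmp/committer-demo"); // placeholder output directory

        TaskAttemptID attemptId = TaskAttemptID.forName("attempt_200707121733_0001_m_000000_0");
        TaskAttemptContext taskContext = new TaskAttemptContextImpl(conf, attemptId);
        JobContext jobContext = new JobContextImpl(conf, attemptId.getJobID());

        FileOutputCommitter committer = new FileOutputCommitter(output, taskContext);

        committer.setupJob(jobContext);   // creates the _temporary job attempt directory
        committer.setupTask(taskContext); // a no-op for FileOutputCommitter; task dirs are created lazily

        // Task output goes under the committer's work path, not the final output directory.
        Path workFile = new Path(committer.getWorkPath(), "part-m-00000"); // placeholder file name
        FileSystem fs = workFile.getFileSystem(conf);
        try (FSDataOutputStream out = fs.create(workFile)) {
            out.writeUTF("example record\n");
        }

        if (committer.needsTaskCommit(taskContext)) {
            committer.commitTask(taskContext); // promotes the task attempt directory
        }
        committer.commitJob(jobContext);       // moves committed task output to the final location
    }
}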
From source file:mlbench.bayes.train.IndexInstances.java
License:Apache License
@SuppressWarnings({ "deprecation" }) public static void main(String[] args) throws MPI_D_Exception, IOException, MPIException { parseArgs(args);/*from w ww .ja va2 s . co m*/ HashMap<String, String> conf = new HashMap<String, String>(); initConf(conf); MPI_D.Init(args, MPI_D.Mode.Common, conf); if (MPI_D.COMM_BIPARTITE_O != null) { rank = MPI_D.Comm_rank(MPI_D.COMM_BIPARTITE_O); if (rank == 0) { System.out.println(IndexInstances.class.getSimpleName() + " O start."); createLabelIndex(labPath); } HadoopUtil.cacheFiles(labPath, config); MPI_D.COMM_BIPARTITE_O.Barrier(); OpenObjectIntHashMap<String> labelIndex = BayesUtils.readIndexFromCache(config); if (MPI_D.COMM_BIPARTITE_O != null) { // O communicator int rank = MPI_D.Comm_rank(MPI_D.COMM_BIPARTITE_O); int size = MPI_D.Comm_size(MPI_D.COMM_BIPARTITE_O); FileSplit[] inputs = DataMPIUtil.HDFSDataLocalLocator.getTaskInputs(MPI_D.COMM_BIPARTITE_O, (JobConf) config, inDir, rank); for (int i = 0; i < inputs.length; i++) { FileSplit fsplit = inputs[i]; SequenceFileRecordReader<Text, VectorWritable> kvrr = new SequenceFileRecordReader<>(config, fsplit); Text labelText = kvrr.createKey(); VectorWritable instance = kvrr.createValue(); while (kvrr.next(labelText, instance)) { String label = SLASH.split(labelText.toString())[1]; if (labelIndex.containsKey(label)) { MPI_D.Send(new IntWritable(labelIndex.get(label)), instance); } } } } } else if (MPI_D.COMM_BIPARTITE_A != null) { int rank = MPI_D.Comm_rank(MPI_D.COMM_BIPARTITE_A); config.set(MAPRED_OUTPUT_DIR, outDir); config.set("mapred.task.id", DataMPIUtil.getHadoopTaskAttemptID().toString().toString()); ((JobConf) config).setOutputKeyClass(IntWritable.class); ((JobConf) config).setOutputValueClass(VectorWritable.class); TaskAttemptContext taskContext = new TaskAttemptContextImpl(config, DataMPIUtil.getHadoopTaskAttemptID()); SequenceFileOutputFormat<IntWritable, VectorWritable> outfile = new SequenceFileOutputFormat<>(); FileSystem fs = FileSystem.get(config); Path output = new Path(config.get(MAPRED_OUTPUT_DIR)); FileOutputCommitter fcommitter = new FileOutputCommitter(output, taskContext); RecordWriter<IntWritable, VectorWritable> outrw = null; try { fcommitter.setupJob(taskContext); outrw = outfile.getRecordWriter(fs, (JobConf) config, getOutputName(rank), null); } catch (IOException e) { e.printStackTrace(); System.err.println("ERROR: Please set the HDFS configuration properly\n"); System.exit(-1); } IntWritable key = null, newKey = null; VectorWritable point = null, newPoint = null; Vector vector = null; Object[] vals = MPI_D.Recv(); while (vals != null) { newKey = (IntWritable) vals[0]; newPoint = (VectorWritable) vals[1]; if (key == null && point == null) { } else if (!key.equals(newKey)) { outrw.write(key, new VectorWritable(vector)); vector = null; } if (vector == null) { vector = newPoint.get(); } else { vector.assign(newPoint.get(), Functions.PLUS); } key = newKey; point = newPoint; vals = MPI_D.Recv(); } if (newKey != null && newPoint != null) { outrw.write(key, new VectorWritable(vector)); } outrw.close(null); if (fcommitter.needsTaskCommit(taskContext)) { fcommitter.commitTask(taskContext); } } MPI_D.Finalize(); }
From source file:mlbench.bayes.train.WeightSummer.java
License:Apache License
@SuppressWarnings("deprecation") public static void main(String[] args) throws MPI_D_Exception, IOException, MPIException { parseArgs(args);/*from ww w .ja v a 2s . com*/ HashMap<String, String> conf = new HashMap<String, String>(); initConf(conf); MPI_D.Init(args, MPI_D.Mode.Common, conf); if (MPI_D.COMM_BIPARTITE_O != null) { int rank = MPI_D.Comm_rank(MPI_D.COMM_BIPARTITE_O); int size = MPI_D.Comm_size(MPI_D.COMM_BIPARTITE_O); FileSplit[] inputs = DataMPIUtil.HDFSDataLocalLocator.getTaskInputs(MPI_D.COMM_BIPARTITE_O, (JobConf) config, inDir, rank); Vector weightsPerFeature = null; Vector weightsPerLabel = new DenseVector(labNum); for (int i = 0; i < inputs.length; i++) { FileSplit fsplit = inputs[i]; SequenceFileRecordReader<IntWritable, VectorWritable> kvrr = new SequenceFileRecordReader<>(config, fsplit); IntWritable index = kvrr.createKey(); VectorWritable value = kvrr.createValue(); while (kvrr.next(index, value)) { Vector instance = value.get(); if (weightsPerFeature == null) { weightsPerFeature = new RandomAccessSparseVector(instance.size(), instance.getNumNondefaultElements()); } int label = index.get(); weightsPerFeature.assign(instance, Functions.PLUS); weightsPerLabel.set(label, weightsPerLabel.get(label) + instance.zSum()); } } if (weightsPerFeature != null) { MPI_D.Send(new Text(WEIGHTS_PER_FEATURE), new VectorWritable(weightsPerFeature)); MPI_D.Send(new Text(WEIGHTS_PER_LABEL), new VectorWritable(weightsPerLabel)); } } else if (MPI_D.COMM_BIPARTITE_A != null) { int rank = MPI_D.Comm_rank(MPI_D.COMM_BIPARTITE_A); config.set(MAPRED_OUTPUT_DIR, outDirW); config.set("mapred.task.id", DataMPIUtil.getHadoopTaskAttemptID().toString().toString()); ((JobConf) config).setOutputKeyClass(Text.class); ((JobConf) config).setOutputValueClass(VectorWritable.class); TaskAttemptContext taskContext = new TaskAttemptContextImpl(config, DataMPIUtil.getHadoopTaskAttemptID()); SequenceFileOutputFormat<Text, VectorWritable> outfile = new SequenceFileOutputFormat<>(); FileSystem fs = FileSystem.get(config); Path output = new Path(config.get(MAPRED_OUTPUT_DIR)); FileOutputCommitter fcommitter = new FileOutputCommitter(output, taskContext); RecordWriter<Text, VectorWritable> outrw = null; try { fcommitter.setupJob(taskContext); outrw = outfile.getRecordWriter(fs, (JobConf) config, getOutputName(rank), null); } catch (IOException e) { e.printStackTrace(); System.err.println("ERROR: Please set the HDFS configuration properly\n"); System.exit(-1); } Text key = null, newKey = null; VectorWritable point = null, newPoint = null; Vector vector = null; Object[] vals = MPI_D.Recv(); while (vals != null) { newKey = (Text) vals[0]; newPoint = (VectorWritable) vals[1]; if (key == null && point == null) { } else if (!key.equals(newKey)) { outrw.write(key, new VectorWritable(vector)); vector = null; } if (vector == null) { vector = newPoint.get(); } else { vector.assign(newPoint.get(), Functions.PLUS); } key = newKey; point = newPoint; vals = MPI_D.Recv(); } if (newKey != null && newPoint != null) { outrw.write(key, new VectorWritable(vector)); } outrw.close(null); if (fcommitter.needsTaskCommit(taskContext)) { fcommitter.commitTask(taskContext); } MPI_D.COMM_BIPARTITE_A.Barrier(); if (rank == 0) { Path resOut = new Path(outDir); NaiveBayesModel naiveBayesModel = BayesUtils.readModelFromDir(new Path(outDir), config); naiveBayesModel.serialize(resOut, config); } } MPI_D.Finalize(); }
From source file:org.apache.accumulo.examples.wikisearch.ingest.WikipediaMapperTest.java
License:Apache License
@Before public void setup() throws Exception { conf.set(AggregatingRecordReader.START_TOKEN, "<page>"); conf.set(AggregatingRecordReader.END_TOKEN, "</page>"); conf.set(WikipediaConfiguration.TABLE_NAME, TABLE_NAME); conf.set(WikipediaConfiguration.NUM_PARTITIONS, "1"); conf.set(WikipediaConfiguration.NUM_GROUPS, "1"); MockInstance i = new MockInstance(); c = i.getConnector("root", "pass"); c.tableOperations().delete(METADATA_TABLE_NAME); c.tableOperations().delete(TABLE_NAME); c.tableOperations().delete(INDEX_TABLE_NAME); c.tableOperations().delete(RINDEX_TABLE_NAME); c.tableOperations().create(METADATA_TABLE_NAME); c.tableOperations().create(TABLE_NAME); c.tableOperations().create(INDEX_TABLE_NAME); c.tableOperations().create(RINDEX_TABLE_NAME); writerMap.put(new Text(METADATA_TABLE_NAME), c.createBatchWriter(METADATA_TABLE_NAME, 1000L, 1000L, 1)); writerMap.put(new Text(TABLE_NAME), c.createBatchWriter(TABLE_NAME, 1000L, 1000L, 1)); writerMap.put(new Text(INDEX_TABLE_NAME), c.createBatchWriter(INDEX_TABLE_NAME, 1000L, 1000L, 1)); writerMap.put(new Text(RINDEX_TABLE_NAME), c.createBatchWriter(RINDEX_TABLE_NAME, 1000L, 1000L, 1)); TaskAttemptID id = new TaskAttemptID(); TaskAttemptContext context = new TaskAttemptContext(conf, id); RawLocalFileSystem fs = new RawLocalFileSystem(); fs.setConf(conf);//from w w w.j a v a 2 s.c om URL url = ClassLoader.getSystemResource("enwiki-20110901-001.xml"); Assert.assertNotNull(url); File data = new File(url.toURI()); Path tmpFile = new Path(data.getAbsolutePath()); // Setup the Mapper InputSplit split = new FileSplit(tmpFile, 0, fs.pathToFile(tmpFile).length(), null); AggregatingRecordReader rr = new AggregatingRecordReader(); Path ocPath = new Path(tmpFile, "oc"); OutputCommitter oc = new FileOutputCommitter(ocPath, context); fs.deleteOnExit(ocPath); StandaloneStatusReporter sr = new StandaloneStatusReporter(); rr.initialize(split, context); MockAccumuloRecordWriter rw = new MockAccumuloRecordWriter(); WikipediaMapper mapper = new WikipediaMapper(); // Load data into Mock Accumulo Mapper<LongWritable, Text, Text, Mutation>.Context con = mapper.new Context(conf, id, rr, rw, oc, sr, split); mapper.run(con); // Flush and close record writers. rw.close(context); }
From source file:org.apache.accumulo.examples.wikisearch.logic.TestQueryLogic.java
License:Apache License
@Before
public void setup() throws Exception {
  Logger.getLogger(AbstractQueryLogic.class).setLevel(Level.DEBUG);
  Logger.getLogger(QueryLogic.class).setLevel(Level.DEBUG);
  Logger.getLogger(RangeCalculator.class).setLevel(Level.DEBUG);

  conf.set(AggregatingRecordReader.START_TOKEN, "<page>");
  conf.set(AggregatingRecordReader.END_TOKEN, "</page>");
  conf.set(WikipediaConfiguration.TABLE_NAME, TABLE_NAME);
  conf.set(WikipediaConfiguration.NUM_PARTITIONS, "1");
  conf.set(WikipediaConfiguration.NUM_GROUPS, "1");

  MockInstance i = new MockInstance();
  c = i.getConnector("root", new PasswordToken(""));
  WikipediaIngester.createTables(c.tableOperations(), TABLE_NAME, false);
  for (String table : TABLE_NAMES) {
    writerMap.put(new Text(table), c.createBatchWriter(table, 1000L, 1000L, 1));
  }

  TaskAttemptID id = new TaskAttemptID("fake", 1, TaskType.MAP, 1, 1);
  TaskAttemptContext context = new TaskAttemptContextImpl(conf, id);

  RawLocalFileSystem fs = new RawLocalFileSystem();
  fs.setConf(conf);

  URL url = ClassLoader.getSystemResource("enwiki-20110901-001.xml");
  Assert.assertNotNull(url);
  File data = new File(url.toURI());
  Path tmpFile = new Path(data.getAbsolutePath());

  // Setup the Mapper
  WikipediaInputSplit split = new WikipediaInputSplit(
      new FileSplit(tmpFile, 0, fs.pathToFile(tmpFile).length(), null), 0);
  AggregatingRecordReader rr = new AggregatingRecordReader();
  Path ocPath = new Path(tmpFile, "oc");
  OutputCommitter oc = new FileOutputCommitter(ocPath, context);
  fs.deleteOnExit(ocPath);
  StandaloneStatusReporter sr = new StandaloneStatusReporter();
  rr.initialize(split, context);
  MockAccumuloRecordWriter rw = new MockAccumuloRecordWriter();
  WikipediaMapper mapper = new WikipediaMapper();

  // there are times I wonder, "Why do Java people think this is good?" then I drink more whiskey
  final MapContextImpl<LongWritable, Text, Text, Mutation> mapContext =
      new MapContextImpl<LongWritable, Text, Text, Mutation>(conf, id, rr, rw, oc, sr, split);

  // Load data into Mock Accumulo
  Mapper<LongWritable, Text, Text, Mutation>.Context con = mapper.new Context() {
    /**
     * Get the input split for this map.
     */
    public InputSplit getInputSplit() { return mapContext.getInputSplit(); }

    @Override public LongWritable getCurrentKey() throws IOException, InterruptedException { return mapContext.getCurrentKey(); }
    @Override public Text getCurrentValue() throws IOException, InterruptedException { return mapContext.getCurrentValue(); }
    @Override public boolean nextKeyValue() throws IOException, InterruptedException { return mapContext.nextKeyValue(); }
    @Override public Counter getCounter(Enum<?> counterName) { return mapContext.getCounter(counterName); }
    @Override public Counter getCounter(String groupName, String counterName) { return mapContext.getCounter(groupName, counterName); }
    @Override public OutputCommitter getOutputCommitter() { return mapContext.getOutputCommitter(); }
    @Override public void write(Text key, Mutation value) throws IOException, InterruptedException { mapContext.write(key, value); }
    @Override public String getStatus() { return mapContext.getStatus(); }
    @Override public TaskAttemptID getTaskAttemptID() { return mapContext.getTaskAttemptID(); }
    @Override public void setStatus(String msg) { mapContext.setStatus(msg); }
    @Override public Path[] getArchiveClassPaths() { return mapContext.getArchiveClassPaths(); }
    @Override public String[] getArchiveTimestamps() { return mapContext.getArchiveTimestamps(); }
    @Override public URI[] getCacheArchives() throws IOException { return mapContext.getCacheArchives(); }
    @Override public URI[] getCacheFiles() throws IOException { return mapContext.getCacheFiles(); }
    @Override public Class<? extends Reducer<?, ?, ?, ?>> getCombinerClass() throws ClassNotFoundException { return mapContext.getCombinerClass(); }
    @Override public Configuration getConfiguration() { return mapContext.getConfiguration(); }
    @Override public Path[] getFileClassPaths() { return mapContext.getFileClassPaths(); }
    @Override public String[] getFileTimestamps() { return mapContext.getFileTimestamps(); }
    @Override public RawComparator<?> getGroupingComparator() { return mapContext.getGroupingComparator(); }
    @Override public Class<? extends InputFormat<?, ?>> getInputFormatClass() throws ClassNotFoundException { return mapContext.getInputFormatClass(); }
    @Override public String getJar() { return mapContext.getJar(); }
    @Override public JobID getJobID() { return mapContext.getJobID(); }
    @Override public String getJobName() { return mapContext.getJobName(); }
    /*@Override public boolean userClassesTakesPrecedence() { return mapContext.userClassesTakesPrecedence(); }*/
    @Override public boolean getJobSetupCleanupNeeded() { return mapContext.getJobSetupCleanupNeeded(); }
    @Override public boolean getTaskCleanupNeeded() { return mapContext.getTaskCleanupNeeded(); }
    @Override public Path[] getLocalCacheArchives() throws IOException { return mapContext.getLocalCacheArchives(); }
    @Override public Path[] getLocalCacheFiles() throws IOException { return mapContext.getLocalCacheFiles(); }
    @Override public Class<?> getMapOutputKeyClass() { return mapContext.getMapOutputKeyClass(); }
    @Override public Class<?> getMapOutputValueClass() { return mapContext.getMapOutputValueClass(); }
    @Override public Class<? extends Mapper<?, ?, ?, ?>> getMapperClass() throws ClassNotFoundException { return mapContext.getMapperClass(); }
    @Override public int getMaxMapAttempts() { return mapContext.getMaxMapAttempts(); }
    @Override public int getMaxReduceAttempts() { return mapContext.getMaxReduceAttempts(); }
    @Override public int getNumReduceTasks() { return mapContext.getNumReduceTasks(); }
    @Override public Class<? extends OutputFormat<?, ?>> getOutputFormatClass() throws ClassNotFoundException { return mapContext.getOutputFormatClass(); }
    @Override public Class<?> getOutputKeyClass() { return mapContext.getOutputKeyClass(); }
    @Override public Class<?> getOutputValueClass() { return mapContext.getOutputValueClass(); }
    @Override public Class<? extends Partitioner<?, ?>> getPartitionerClass() throws ClassNotFoundException { return mapContext.getPartitionerClass(); }
    @Override public Class<? extends Reducer<?, ?, ?, ?>> getReducerClass() throws ClassNotFoundException { return mapContext.getReducerClass(); }
    @Override public RawComparator<?> getSortComparator() { return mapContext.getSortComparator(); }
    @Override public boolean getSymlink() { return mapContext.getSymlink(); }
    @Override public Path getWorkingDirectory() throws IOException { return mapContext.getWorkingDirectory(); }
    @Override public void progress() { mapContext.progress(); }
    @Override public boolean getProfileEnabled() { return mapContext.getProfileEnabled(); }
    @Override public String getProfileParams() { return mapContext.getProfileParams(); }
    @Override public IntegerRanges getProfileTaskRange(boolean isMap) { return mapContext.getProfileTaskRange(isMap); }
    @Override public String getUser() { return mapContext.getUser(); }
    @Override public Credentials getCredentials() { return mapContext.getCredentials(); }
    @Override public float getProgress() { return mapContext.getProgress(); }
  };

  mapper.run(con);

  // Flush and close record writers.
  rw.close(context);

  table = new QueryLogic();
  table.setMetadataTableName(METADATA_TABLE_NAME);
  table.setTableName(TABLE_NAME);
  table.setIndexTableName(INDEX_TABLE_NAME);
  table.setReverseIndexTableName(RINDEX_TABLE_NAME);
  table.setUseReadAheadIterator(false);
  table.setUnevaluatedFields(Collections.singletonList("TEXT"));
}
From source file:org.apache.flink.hadoopcompatibility.mapreduce.HadoopOutputFormat.java
License:Apache License
/**
 * create the temporary output file for hadoop RecordWriter.
 * @param taskNumber The number of the parallel instance.
 * @param numTasks The number of parallel tasks.
 * @throws IOException
 */
@Override
public void open(int taskNumber, int numTasks) throws IOException {
    if (Integer.toString(taskNumber + 1).length() > 6) {
        throw new IOException("Task id too large.");
    }

    // for hadoop 2.2
    this.configuration.set("mapreduce.output.basename", "tmp");

    TaskAttemptID taskAttemptID = TaskAttemptID.forName("attempt__0000_r_"
            + String.format("%" + (6 - Integer.toString(taskNumber + 1).length()) + "s", " ").replace(" ", "0")
            + Integer.toString(taskNumber + 1) + "_0");

    this.configuration.set("mapred.task.id", taskAttemptID.toString());
    this.configuration.setInt("mapred.task.partition", taskNumber + 1);
    // for hadoop 2.2
    this.configuration.set("mapreduce.task.attempt.id", taskAttemptID.toString());
    this.configuration.setInt("mapreduce.task.partition", taskNumber + 1);

    try {
        this.context = HadoopUtils.instantiateTaskAttemptContext(this.configuration, taskAttemptID);
    } catch (Exception e) {
        throw new RuntimeException(e);
    }

    this.fileOutputCommitter = new FileOutputCommitter(new Path(this.configuration.get("mapred.output.dir")),
            context);

    try {
        this.fileOutputCommitter.setupJob(HadoopUtils.instantiateJobContext(this.configuration, new JobID()));
    } catch (Exception e) {
        throw new RuntimeException(e);
    }

    // compatible for hadoop 2.2.0, the temporary output directory is different from hadoop 1.2.1
    this.configuration.set("mapreduce.task.output.dir", this.fileOutputCommitter.getWorkPath().toString());

    try {
        this.recordWriter = this.mapreduceOutputFormat.getRecordWriter(this.context);
    } catch (InterruptedException e) {
        throw new IOException("Could not create RecordWriter.", e);
    }
}
From source file:org.apache.hcatalog.mapreduce.FileOutputFormatContainer.java
License:Apache License
static void setWorkOutputPath(TaskAttemptContext context) throws IOException {
    String outputPath = context.getConfiguration().get("mapred.output.dir");
    // we need to do this to get the task path and set it for mapred implementation
    // since it can't be done automatically because of mapreduce->mapred abstraction
    if (outputPath != null)
        context.getConfiguration().set("mapred.work.output.dir",
                new FileOutputCommitter(new Path(outputPath), context).getWorkPath().toString());
}
From source file:org.apache.hcatalog.mapreduce.FileRecordWriterContainer.java
License:Apache License
@Override
public void write(WritableComparable<?> key, HCatRecord value) throws IOException, InterruptedException {
    org.apache.hadoop.mapred.RecordWriter localWriter;
    ObjectInspector localObjectInspector;
    SerDe localSerDe;
    OutputJobInfo localJobInfo = null;

    if (dynamicPartitioningUsed) {
        // calculate which writer to use from the remaining values - this needs to be done before we delete cols
        List<String> dynamicPartValues = new ArrayList<String>();
        for (Integer colToAppend : dynamicPartCols) {
            dynamicPartValues.add(value.get(colToAppend).toString());
        }

        String dynKey = dynamicPartValues.toString();
        if (!baseDynamicWriters.containsKey(dynKey)) {
            if ((maxDynamicPartitions != -1) && (baseDynamicWriters.size() > maxDynamicPartitions)) {
                throw new HCatException(ErrorType.ERROR_TOO_MANY_DYNAMIC_PTNS,
                        "Number of dynamic partitions being created "
                                + "exceeds configured max allowable partitions[" + maxDynamicPartitions
                                + "], increase parameter [" + HiveConf.ConfVars.DYNAMICPARTITIONMAXPARTS.varname
                                + "] if needed.");
            }

            org.apache.hadoop.mapred.TaskAttemptContext currTaskContext = HCatMapRedUtil
                    .createTaskAttemptContext(context);
            configureDynamicStorageHandler(currTaskContext, dynamicPartValues);
            localJobInfo = HCatBaseOutputFormat.getJobInfo(currTaskContext);

            //setup serDe
            SerDe currSerDe = ReflectionUtils.newInstance(storageHandler.getSerDeClass(),
                    currTaskContext.getJobConf());
            try {
                InternalUtil.initializeOutputSerDe(currSerDe, currTaskContext.getConfiguration(), localJobInfo);
            } catch (SerDeException e) {
                throw new IOException("Failed to initialize SerDe", e);
            }

            //create base OutputFormat
            org.apache.hadoop.mapred.OutputFormat baseOF = ReflectionUtils
                    .newInstance(storageHandler.getOutputFormatClass(), currTaskContext.getJobConf());

            //We are skipping calling checkOutputSpecs() for each partition
            //As it can throw a FileAlreadyExistsException when more than one mapper is writing to a partition
            //See HCATALOG-490, also to avoid contacting the namenode for each new FileOutputFormat instance
            //In general this should be ok for most FileOutputFormat implementations
            //but may become an issue for cases when the method is used to perform other setup tasks

            //get Output Committer
            org.apache.hadoop.mapred.OutputCommitter baseOutputCommitter = currTaskContext.getJobConf()
                    .getOutputCommitter();
            //create currJobContext the latest so it gets all the config changes
            org.apache.hadoop.mapred.JobContext currJobContext = HCatMapRedUtil.createJobContext(currTaskContext);
            //setupJob()
            baseOutputCommitter.setupJob(currJobContext);
            //recreate to refresh jobConf of currTask context
            currTaskContext = HCatMapRedUtil.createTaskAttemptContext(currJobContext.getJobConf(),
                    currTaskContext.getTaskAttemptID(), currTaskContext.getProgressible());
            //set temp location
            currTaskContext.getConfiguration().set("mapred.work.output.dir",
                    new FileOutputCommitter(new Path(localJobInfo.getLocation()), currTaskContext).getWorkPath()
                            .toString());
            //setupTask()
            baseOutputCommitter.setupTask(currTaskContext);

            Path parentDir = new Path(currTaskContext.getConfiguration().get("mapred.work.output.dir"));
            Path childPath = new Path(parentDir, FileOutputFormat.getUniqueFile(currTaskContext, "part", ""));

            org.apache.hadoop.mapred.RecordWriter baseRecordWriter = baseOF.getRecordWriter(
                    parentDir.getFileSystem(currTaskContext.getConfiguration()), currTaskContext.getJobConf(),
                    childPath.toString(), InternalUtil.createReporter(currTaskContext));

            baseDynamicWriters.put(dynKey, baseRecordWriter);
            baseDynamicSerDe.put(dynKey, currSerDe);
            baseDynamicCommitters.put(dynKey, baseOutputCommitter);
            dynamicContexts.put(dynKey, currTaskContext);
            dynamicObjectInspectors.put(dynKey, InternalUtil.createStructObjectInspector(jobInfo.getOutputSchema()));
            dynamicOutputJobInfo.put(dynKey, HCatOutputFormat.getJobInfo(dynamicContexts.get(dynKey)));
        }

        localJobInfo = dynamicOutputJobInfo.get(dynKey);
        localWriter = baseDynamicWriters.get(dynKey);
        localSerDe = baseDynamicSerDe.get(dynKey);
        localObjectInspector = dynamicObjectInspectors.get(dynKey);
    } else {
        localJobInfo = jobInfo;
        localWriter = getBaseRecordWriter();
        localSerDe = serDe;
        localObjectInspector = objectInspector;
    }

    for (Integer colToDel : partColsToDel) {
        value.remove(colToDel);
    }

    //The key given by user is ignored
    try {
        localWriter.write(NullWritable.get(), localSerDe.serialize(value.getAll(), localObjectInspector));
    } catch (SerDeException e) {
        throw new IOException("Failed to serialize object", e);
    }
}
From source file:org.apache.hive.hcatalog.mapreduce.DynamicPartitionFileRecordWriterContainer.java
License:Apache License
@Override
protected LocalFileWriter getLocalFileWriter(HCatRecord value) throws IOException, HCatException {
    OutputJobInfo localJobInfo = null;
    // Calculate which writer to use from the remaining values - this needs to
    // be done before we delete cols.
    List<String> dynamicPartValues = new ArrayList<String>();
    for (Integer colToAppend : dynamicPartCols) {
        Object partitionValue = value.get(colToAppend);
        dynamicPartValues.add(partitionValue == null ? HIVE_DEFAULT_PARTITION_VALUE : partitionValue.toString());
    }

    String dynKey = dynamicPartValues.toString();
    if (!baseDynamicWriters.containsKey(dynKey)) {
        if ((maxDynamicPartitions != -1) && (baseDynamicWriters.size() > maxDynamicPartitions)) {
            throw new HCatException(ErrorType.ERROR_TOO_MANY_DYNAMIC_PTNS,
                    "Number of dynamic partitions being created "
                            + "exceeds configured max allowable partitions[" + maxDynamicPartitions
                            + "], increase parameter [" + HiveConf.ConfVars.DYNAMICPARTITIONMAXPARTS.varname
                            + "] if needed.");
        }

        org.apache.hadoop.mapred.TaskAttemptContext currTaskContext = HCatMapRedUtil
                .createTaskAttemptContext(context);
        configureDynamicStorageHandler(currTaskContext, dynamicPartValues);
        localJobInfo = HCatBaseOutputFormat.getJobInfo(currTaskContext.getConfiguration());

        // Setup serDe.
        SerDe currSerDe = ReflectionUtils.newInstance(storageHandler.getSerDeClass(),
                currTaskContext.getJobConf());
        try {
            InternalUtil.initializeOutputSerDe(currSerDe, currTaskContext.getConfiguration(), localJobInfo);
        } catch (SerDeException e) {
            throw new IOException("Failed to initialize SerDe", e);
        }

        // create base OutputFormat
        org.apache.hadoop.mapred.OutputFormat baseOF = ReflectionUtils
                .newInstance(storageHandler.getOutputFormatClass(), currTaskContext.getJobConf());

        // We are skipping calling checkOutputSpecs() for each partition
        // As it can throw a FileAlreadyExistsException when more than one
        // mapper is writing to a partition.
        // See HCATALOG-490, also to avoid contacting the namenode for each new
        // FileOutputFormat instance.
        // In general this should be ok for most FileOutputFormat implementations
        // but may become an issue for cases when the method is used to perform
        // other setup tasks.

        // Get Output Committer
        org.apache.hadoop.mapred.OutputCommitter baseOutputCommitter = currTaskContext.getJobConf()
                .getOutputCommitter();

        // Create currJobContext the latest so it gets all the config changes
        org.apache.hadoop.mapred.JobContext currJobContext = HCatMapRedUtil.createJobContext(currTaskContext);

        // Set up job.
        baseOutputCommitter.setupJob(currJobContext);

        // Recreate to refresh jobConf of currTask context.
        currTaskContext = HCatMapRedUtil.createTaskAttemptContext(currJobContext.getJobConf(),
                currTaskContext.getTaskAttemptID(), currTaskContext.getProgressible());

        // Set temp location.
        currTaskContext.getConfiguration().set("mapred.work.output.dir",
                new FileOutputCommitter(new Path(localJobInfo.getLocation()), currTaskContext).getWorkPath()
                        .toString());

        // Set up task.
        baseOutputCommitter.setupTask(currTaskContext);

        Path parentDir = new Path(currTaskContext.getConfiguration().get("mapred.work.output.dir"));
        Path childPath = new Path(parentDir, FileOutputFormat.getUniqueFile(currTaskContext,
                currTaskContext.getConfiguration().get("mapreduce.output.basename", "part"), ""));

        RecordWriter baseRecordWriter = baseOF.getRecordWriter(
                parentDir.getFileSystem(currTaskContext.getConfiguration()), currTaskContext.getJobConf(),
                childPath.toString(), InternalUtil.createReporter(currTaskContext));

        baseDynamicWriters.put(dynKey, baseRecordWriter);
        baseDynamicSerDe.put(dynKey, currSerDe);
        baseDynamicCommitters.put(dynKey, baseOutputCommitter);
        dynamicContexts.put(dynKey, currTaskContext);
        dynamicObjectInspectors.put(dynKey, InternalUtil.createStructObjectInspector(jobInfo.getOutputSchema()));
        dynamicOutputJobInfo.put(dynKey,
                HCatOutputFormat.getJobInfo(dynamicContexts.get(dynKey).getConfiguration()));
    }

    return new LocalFileWriter(baseDynamicWriters.get(dynKey), dynamicObjectInspectors.get(dynKey),
            baseDynamicSerDe.get(dynKey), dynamicOutputJobInfo.get(dynKey));
}
From source file:org.apache.phoenix.mapreduce.MultiHfileOutputFormat.java
License:Apache License
/**
 * @param context
 * @return
 * @throws IOException
 */
static <V extends Cell> RecordWriter<TableRowkeyPair, V> createRecordWriter(final TaskAttemptContext context)
        throws IOException {
  // Get the path of the temporary output file
  final Path outputPath = FileOutputFormat.getOutputPath(context);
  final Path outputdir = new FileOutputCommitter(outputPath, context).getWorkPath();
  final Configuration conf = context.getConfiguration();
  final FileSystem fs = outputdir.getFileSystem(conf);

  final long maxsize = conf.getLong(HConstants.HREGION_MAX_FILESIZE, HConstants.DEFAULT_MAX_FILE_SIZE);

  // Invented config. Add to hbase-*.xml if other than default compression.
  final String defaultCompressionStr = conf.get("hfile.compression", Compression.Algorithm.NONE.getName());
  final Algorithm defaultCompression = AbstractHFileWriter.compressionByName(defaultCompressionStr);
  final boolean compactionExclude = conf.getBoolean("hbase.mapreduce.hfileoutputformat.compaction.exclude",
      false);

  return new RecordWriter<TableRowkeyPair, V>() {
    // Map of families to writers and how much has been output on the writer.
    private final Map<byte[], WriterLength> writers = new TreeMap<byte[], WriterLength>(Bytes.BYTES_COMPARATOR);
    private byte[] previousRow = HConstants.EMPTY_BYTE_ARRAY;
    private final byte[] now = Bytes.toBytes(EnvironmentEdgeManager.currentTimeMillis());
    private boolean rollRequested = false;

    @Override
    public void write(TableRowkeyPair row, V cell) throws IOException {
      KeyValue kv = KeyValueUtil.ensureKeyValue(cell);
      // null input == user explicitly wants to flush
      if (row == null && kv == null) {
        rollWriters();
        return;
      }

      // phoenix-2216: start : extract table name from the rowkey
      String tableName = row.getTableName();
      byte[] rowKey = row.getRowkey().get();
      long length = kv.getLength();
      byte[] family = CellUtil.cloneFamily(kv);
      byte[] tableAndFamily = join(tableName, Bytes.toString(family));
      WriterLength wl = this.writers.get(tableAndFamily);
      // phoenix-2216: end

      // If this is a new column family, verify that the directory exists
      if (wl == null) {
        // phoenix-2216: start : create a directory for table and family within the output dir
        Path tableOutputPath = CsvBulkImportUtil.getOutputPath(outputdir, tableName);
        fs.mkdirs(new Path(tableOutputPath, Bytes.toString(family)));
        // phoenix-2216: end
      }

      // If any of the HFiles for the column families has reached
      // maxsize, we need to roll all the writers
      if (wl != null && wl.written + length >= maxsize) {
        this.rollRequested = true;
      }

      // This can only happen once a row is finished though
      if (rollRequested && Bytes.compareTo(this.previousRow, rowKey) != 0) {
        rollWriters();
      }

      // create a new WAL writer, if necessary
      if (wl == null || wl.writer == null) {
        // phoenix-2216: start : passed even the table name
        wl = getNewWriter(tableName, family, conf);
        // phoenix-2216: end
      }

      // we now have the proper WAL writer. full steam ahead
      kv.updateLatestStamp(this.now);
      wl.writer.append(kv);
      wl.written += length;

      // Copy the row so we know when a row transition.
      this.previousRow = rowKey;
    }

    private void rollWriters() throws IOException {
      for (WriterLength wl : this.writers.values()) {
        if (wl.writer != null) {
          LOG.info("Writer=" + wl.writer.getPath() + ((wl.written == 0) ? "" : ", wrote=" + wl.written));
          close(wl.writer);
        }
        wl.writer = null;
        wl.written = 0;
      }
      this.rollRequested = false;
    }

    /* Create a new StoreFile.Writer.
     * @param family
     * @return A WriterLength, containing a new StoreFile.Writer.
     * @throws IOException
     */
    @edu.umd.cs.findbugs.annotations.SuppressWarnings(value = "BX_UNBOXING_IMMEDIATELY_REBOXED",
        justification = "Not important")
    private WriterLength getNewWriter(final String tableName, byte[] family, Configuration conf)
        throws IOException {
      WriterLength wl = new WriterLength();
      Path tableOutputPath = CsvBulkImportUtil.getOutputPath(outputdir, tableName);
      Path familydir = new Path(tableOutputPath, Bytes.toString(family));

      // phoenix-2216: start : fetching the configuration properties that were set to the table.
      // create a map from column family to the compression algorithm for the table.
      final Map<byte[], Algorithm> compressionMap = createFamilyCompressionMap(conf, tableName);
      final Map<byte[], BloomType> bloomTypeMap = createFamilyBloomTypeMap(conf, tableName);
      final Map<byte[], Integer> blockSizeMap = createFamilyBlockSizeMap(conf, tableName);
      // phoenix-2216: end

      String dataBlockEncodingStr = conf.get(DATABLOCK_ENCODING_OVERRIDE_CONF_KEY);
      final Map<byte[], DataBlockEncoding> datablockEncodingMap = createFamilyDataBlockEncodingMap(conf,
          tableName);
      final DataBlockEncoding overriddenEncoding;
      if (dataBlockEncodingStr != null) {
        overriddenEncoding = DataBlockEncoding.valueOf(dataBlockEncodingStr);
      } else {
        overriddenEncoding = null;
      }

      Algorithm compression = compressionMap.get(family);
      compression = compression == null ? defaultCompression : compression;
      BloomType bloomType = bloomTypeMap.get(family);
      bloomType = bloomType == null ? BloomType.NONE : bloomType;
      Integer blockSize = blockSizeMap.get(family);
      blockSize = blockSize == null ? HConstants.DEFAULT_BLOCKSIZE : blockSize;
      DataBlockEncoding encoding = overriddenEncoding;
      encoding = encoding == null ? datablockEncodingMap.get(family) : encoding;
      encoding = encoding == null ? DataBlockEncoding.NONE : encoding;

      Configuration tempConf = new Configuration(conf);
      tempConf.setFloat(HConstants.HFILE_BLOCK_CACHE_SIZE_KEY, 0.0f);
      HFileContextBuilder contextBuilder = new HFileContextBuilder().withCompression(compression)
          .withChecksumType(HStore.getChecksumType(conf))
          .withBytesPerCheckSum(HStore.getBytesPerChecksum(conf)).withBlockSize(blockSize);
      contextBuilder.withDataBlockEncoding(encoding);
      HFileContext hFileContext = contextBuilder.build();

      wl.writer = new StoreFile.WriterBuilder(conf, new CacheConfig(tempConf), fs)
          .withOutputDir(familydir).withBloomType(bloomType).withComparator(KeyValue.COMPARATOR)
          .withFileContext(hFileContext).build();

      // join and put it in the writers map .
      // phoenix-2216: start : holds a map of writers where the
      // key in the map is a join byte array of table name and family.
      byte[] tableAndFamily = join(tableName, Bytes.toString(family));
      this.writers.put(tableAndFamily, wl);
      // phoenix-2216: end
      return wl;
    }

    private void close(final StoreFile.Writer w) throws IOException {
      if (w != null) {
        w.appendFileInfo(StoreFile.BULKLOAD_TIME_KEY,
            Bytes.toBytes(EnvironmentEdgeManager.currentTimeMillis()));
        w.appendFileInfo(StoreFile.BULKLOAD_TASK_KEY, Bytes.toBytes(context.getTaskAttemptID().toString()));
        w.appendFileInfo(StoreFile.MAJOR_COMPACTION_KEY, Bytes.toBytes(true));
        w.appendFileInfo(StoreFile.EXCLUDE_FROM_MINOR_COMPACTION_KEY, Bytes.toBytes(compactionExclude));
        w.appendTrackedTimestampsToMetadata();
        w.close();
      }
    }

    @Override
    public void close(TaskAttemptContext c) throws IOException, InterruptedException {
      for (WriterLength wl : this.writers.values()) {
        close(wl.writer);
      }
    }
  };
}
From source file:org.apache.tajo.storage.hbase.HBaseStorageManager.java
License:Apache License
@Override
public Path commitOutputData(OverridableConf queryContext, ExecutionBlockId finalEbId, LogicalPlan plan,
        Schema schema, TableDesc tableDesc) throws IOException {
    if (tableDesc == null) {
        throw new IOException("TableDesc is null while calling loadIncrementalHFiles: " + finalEbId);
    }

    Path stagingDir = new Path(queryContext.get(QueryVars.STAGING_DIR));
    Path stagingResultDir = new Path(stagingDir, TajoConstants.RESULT_DIR_NAME);

    Configuration hbaseConf = HBaseStorageManager.getHBaseConfiguration(queryContext.getConf(),
            tableDesc.getMeta());
    hbaseConf.set("hbase.loadincremental.threads.max", "2");

    JobContextImpl jobContext = new JobContextImpl(hbaseConf,
            new JobID(finalEbId.getQueryId().toString(), finalEbId.getId()));

    FileOutputCommitter committer = new FileOutputCommitter(stagingResultDir, jobContext);
    Path jobAttemptPath = committer.getJobAttemptPath(jobContext);
    FileSystem fs = jobAttemptPath.getFileSystem(queryContext.getConf());
    if (!fs.exists(jobAttemptPath) || fs.listStatus(jobAttemptPath) == null) {
        LOG.warn("No query attempt file in " + jobAttemptPath);
        return stagingResultDir;
    }
    committer.commitJob(jobContext);

    if (tableDesc.getName() == null && tableDesc.getPath() != null) {
        // insert into location
        return super.commitOutputData(queryContext, finalEbId, plan, schema, tableDesc, false);
    } else {
        // insert into table
        String tableName = tableDesc.getMeta().getOption(HBaseStorageConstants.META_TABLE_KEY);

        HTable htable = new HTable(hbaseConf, tableName);
        try {
            LoadIncrementalHFiles loadIncrementalHFiles = null;
            try {
                loadIncrementalHFiles = new LoadIncrementalHFiles(hbaseConf);
            } catch (Exception e) {
                LOG.error(e.getMessage(), e);
                throw new IOException(e.getMessage(), e);
            }
            loadIncrementalHFiles.doBulkLoad(stagingResultDir, htable);

            return stagingResultDir;
        } finally {
            htable.close();
        }
    }
}