List of usage examples for org.apache.hadoop.mapreduce.lib.output.FileOutputFormat.getUniqueFile
public synchronized static String getUniqueFile(TaskAttemptContext context, String name, String extension)
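getUniqueFile builds a task-unique file name by appending the task type character (m for map, r for reduce) and the zero-padded task partition to the given name, followed by the extension, producing names such as part-m-00000 or part-r-00002.avro. Below is a minimal sketch of the common composition pattern seen in the examples that follow; the UniqueFileExample class, the "data" prefix, and the ".tsv" extension are illustrative and not taken from any of the source files listed here.

import java.io.IOException;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

// Illustrative helper (not part of Hadoop): derives a collision-free,
// per-task file name under the job's configured output directory.
public final class UniqueFileExample {

    private UniqueFileExample() {
    }

    public static Path uniqueOutputFile(TaskAttemptContext context) throws IOException {
        // "data-m-00000.tsv" for the first map task, "data-r-00002.tsv" for reduce task 2, etc.
        String fileName = FileOutputFormat.getUniqueFile(context, "data", ".tsv");
        // Compose with the job output directory; the examples below often use the
        // committer's work path instead, so output stays temporary until commit.
        return new Path(FileOutputFormat.getOutputPath(context), fileName);
    }
}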
From source file:com.asakusafw.runtime.stage.output.TemporaryOutputFormat.java
License:Apache License
/**
 * Creates a new {@link RecordWriter} to output temporary data.
 * @param <V> value type
 * @param context current context
 * @param name output name
 * @param dataType value type
 * @return the created writer
 * @throws IOException if failed to create a new {@link RecordWriter}
 * @throws InterruptedException if interrupted
 * @throws IllegalArgumentException if some parameters were {@code null}
 */
public <V> RecordWriter<NullWritable, V> createRecordWriter(TaskAttemptContext context, String name,
        Class<V> dataType) throws IOException, InterruptedException {
    if (context == null) {
        throw new IllegalArgumentException("context must not be null"); //$NON-NLS-1$
    }
    if (name == null) {
        throw new IllegalArgumentException("name must not be null"); //$NON-NLS-1$
    }
    if (dataType == null) {
        throw new IllegalArgumentException("dataType must not be null"); //$NON-NLS-1$
    }
    CompressionCodec codec = null;
    Configuration conf = context.getConfiguration();
    if (FileOutputFormat.getCompressOutput(context)) {
        Class<?> codecClass = FileOutputFormat.getOutputCompressorClass(context, DefaultCodec.class);
        codec = (CompressionCodec) ReflectionUtils.newInstance(codecClass, conf);
    }
    FileOutputCommitter committer = getOutputCommitter(context);
    final Path file = new Path(committer.getWorkPath(), FileOutputFormat.getUniqueFile(context, name, "")); //$NON-NLS-1$
    final ModelOutput<V> out = TemporaryStorage.openOutput(conf, dataType, file, codec);
    return new RecordWriter<NullWritable, V>() {
        @Override
        public void write(NullWritable key, V value) throws IOException {
            out.write(value);
        }

        @Override
        public void close(TaskAttemptContext ignored) throws IOException {
            out.close();
        }

        @Override
        public String toString() {
            return String.format("TemporaryOutput(%s)", file); //$NON-NLS-1$
        }
    };
}
From source file:com.linkedin.cubert.examples.Purge.java
License:Open Source License
@Override
public void setInput(Map<String, Block> input, JsonNode json, BlockProperties props)
        throws IOException, InterruptedException {
    block = input.values().iterator().next();
    conf = PhaseContext.getConf();
    output = TupleFactory.getInstance().newTuple(3);

    purgeFileName = FileCache.get(filesToCache.get(0));
    if (purgeFileName == null) {
        throw new IOException("purgeFileName is null");
    }
    loadMembersToPurge(purgeFileName);

    String columnName = JsonUtils.getText(json.get("args"), "purgeColumnName");
    setColumnName(columnName);

    // Create temp file
    Path root = null;
    String filename = null;
    tempFileName = null;

    if (PhaseContext.isMapper()) {
        root = FileOutputFormat.getWorkOutputPath(PhaseContext.getMapContext());
        filename = FileOutputFormat.getUniqueFile(PhaseContext.getMapContext(), "tempFileForPurge", "");
    } else {
        root = FileOutputFormat.getWorkOutputPath(PhaseContext.getRedContext());
        filename = FileOutputFormat.getUniqueFile(PhaseContext.getRedContext(), "tempFileForPurge", "");
    }

    tempFileName = root + "/" + filename;
}
From source file:com.linkedin.cubert.operator.TeeOperator.java
License:Open Source License
@Override
public void setInput(Map<String, Block> input, JsonNode json, BlockProperties props)
        throws IOException, InterruptedException {
    block = input.values().iterator().next();
    String prefix = JsonUtils.getText(json, "prefix");
    BlockSchema teeSchema = new BlockSchema(json.get("teeSchema"));

    if (json.has("generate") && !json.get("generate").isNull()) {
        ObjectNode generateJson = JsonUtils.createObjectNode("name", "GENERATE", "input", json.get("input"),
                "output", json.get("input"), "outputTuple", json.get("generate"));
        generateOperator = new GenerateOperator();
        BlockProperties generateProps = new BlockProperties("teeGenerate", teeSchema, props);
        generateOperator.setInput(input, generateJson, generateProps);
    }

    Configuration conf = PhaseContext.getConf();
    Path root = null;
    String filename = null;

    if (PhaseContext.isMapper()) {
        root = FileOutputFormat.getWorkOutputPath(PhaseContext.getMapContext());
        filename = FileOutputFormat.getUniqueFile(PhaseContext.getMapContext(), prefix, "");
    } else {
        root = FileOutputFormat.getWorkOutputPath(PhaseContext.getRedContext());
        filename = FileOutputFormat.getUniqueFile(PhaseContext.getRedContext(), prefix, "");
    }

    writer = openedWriters.get(prefix);
    if (writer == null) {
        writer = StorageFactory.get(JsonUtils.getText(json, "type")).getTeeWriter();
        writer.open(conf, json, teeSchema, root, filename);
        openedWriters.put(prefix, writer);
    }

    if (json.has("filter") && json.get("filter") != null && !json.get("filter").isNull()) {
        JsonNode filterJson = json.get("filter");
        filterTree = new FunctionTree(block);
        try {
            filterTree.addFunctionTree(filterJson);
        } catch (PreconditionException e) {
            throw new RuntimeException(e);
        }
    }
}
From source file:org.apache.crunch.io.avro.trevni.TrevniRecordWriter.java
License:Apache License
public TrevniRecordWriter(TaskAttemptContext context) throws IOException {
    schema = initSchema(context);
    meta = filterMetadata(context.getConfiguration());
    writer = new AvroColumnWriter<T>(schema, meta, ReflectData.get());

    Path outputPath = FileOutputFormat.getOutputPath(context);
    String dir = FileOutputFormat.getUniqueFile(context, "part", "");
    dirPath = new Path(outputPath.toString() + "/" + dir);
    fs = dirPath.getFileSystem(context.getConfiguration());
    fs.mkdirs(dirPath);

    blockSize = fs.getDefaultBlockSize();
}
From source file:org.apache.gora.mapreduce.GoraOutputFormat.java
License:Apache License
private void setOutputPath(DataStore<K, T> store, TaskAttemptContext context) {
    if (store instanceof FileBackedDataStore) {
        FileBackedDataStore<K, T> fileStore = (FileBackedDataStore<K, T>) store;
        String uniqueName = FileOutputFormat.getUniqueFile(context, "part", "");

        // if file store output is not set, then get the output from FileOutputFormat
        if (fileStore.getOutputPath() == null) {
            fileStore.setOutputPath(FileOutputFormat.getOutputPath(context).toString());
        }

        // set the unique name of the data file
        String path = fileStore.getOutputPath();
        fileStore.setOutputPath(path + Path.SEPARATOR + uniqueName);
    }
}
From source file:org.apache.hcatalog.mapreduce.FileRecordWriterContainer.java
License:Apache License
@Override
public void write(WritableComparable<?> key, HCatRecord value) throws IOException, InterruptedException {
    org.apache.hadoop.mapred.RecordWriter localWriter;
    ObjectInspector localObjectInspector;
    SerDe localSerDe;
    OutputJobInfo localJobInfo = null;

    if (dynamicPartitioningUsed) {
        // calculate which writer to use from the remaining values - this needs to be done before we delete cols
        List<String> dynamicPartValues = new ArrayList<String>();
        for (Integer colToAppend : dynamicPartCols) {
            dynamicPartValues.add(value.get(colToAppend).toString());
        }

        String dynKey = dynamicPartValues.toString();
        if (!baseDynamicWriters.containsKey(dynKey)) {
            if ((maxDynamicPartitions != -1) && (baseDynamicWriters.size() > maxDynamicPartitions)) {
                throw new HCatException(ErrorType.ERROR_TOO_MANY_DYNAMIC_PTNS,
                        "Number of dynamic partitions being created "
                                + "exceeds configured max allowable partitions[" + maxDynamicPartitions
                                + "], increase parameter [" + HiveConf.ConfVars.DYNAMICPARTITIONMAXPARTS.varname
                                + "] if needed.");
            }

            org.apache.hadoop.mapred.TaskAttemptContext currTaskContext = HCatMapRedUtil
                    .createTaskAttemptContext(context);
            configureDynamicStorageHandler(currTaskContext, dynamicPartValues);
            localJobInfo = HCatBaseOutputFormat.getJobInfo(currTaskContext);

            // setup serDe
            SerDe currSerDe = ReflectionUtils.newInstance(storageHandler.getSerDeClass(),
                    currTaskContext.getJobConf());
            try {
                InternalUtil.initializeOutputSerDe(currSerDe, currTaskContext.getConfiguration(), localJobInfo);
            } catch (SerDeException e) {
                throw new IOException("Failed to initialize SerDe", e);
            }

            // create base OutputFormat
            org.apache.hadoop.mapred.OutputFormat baseOF = ReflectionUtils
                    .newInstance(storageHandler.getOutputFormatClass(), currTaskContext.getJobConf());

            // We are skipping calling checkOutputSpecs() for each partition
            // As it can throw a FileAlreadyExistsException when more than one mapper is writing to a partition
            // See HCATALOG-490, also to avoid contacting the namenode for each new FileOutputFormat instance
            // In general this should be ok for most FileOutputFormat implementations
            // but may become an issue for cases when the method is used to perform other setup tasks

            // get Output Committer
            org.apache.hadoop.mapred.OutputCommitter baseOutputCommitter = currTaskContext.getJobConf()
                    .getOutputCommitter();
            // create currJobContext the latest so it gets all the config changes
            org.apache.hadoop.mapred.JobContext currJobContext = HCatMapRedUtil
                    .createJobContext(currTaskContext);
            // setupJob()
            baseOutputCommitter.setupJob(currJobContext);
            // recreate to refresh jobConf of currTask context
            currTaskContext = HCatMapRedUtil.createTaskAttemptContext(currJobContext.getJobConf(),
                    currTaskContext.getTaskAttemptID(), currTaskContext.getProgressible());
            // set temp location
            currTaskContext.getConfiguration().set("mapred.work.output.dir",
                    new FileOutputCommitter(new Path(localJobInfo.getLocation()), currTaskContext).getWorkPath()
                            .toString());
            // setupTask()
            baseOutputCommitter.setupTask(currTaskContext);

            Path parentDir = new Path(currTaskContext.getConfiguration().get("mapred.work.output.dir"));
            Path childPath = new Path(parentDir, FileOutputFormat.getUniqueFile(currTaskContext, "part", ""));

            org.apache.hadoop.mapred.RecordWriter baseRecordWriter = baseOF.getRecordWriter(
                    parentDir.getFileSystem(currTaskContext.getConfiguration()), currTaskContext.getJobConf(),
                    childPath.toString(), InternalUtil.createReporter(currTaskContext));

            baseDynamicWriters.put(dynKey, baseRecordWriter);
            baseDynamicSerDe.put(dynKey, currSerDe);
            baseDynamicCommitters.put(dynKey, baseOutputCommitter);
            dynamicContexts.put(dynKey, currTaskContext);
            dynamicObjectInspectors.put(dynKey,
                    InternalUtil.createStructObjectInspector(jobInfo.getOutputSchema()));
            dynamicOutputJobInfo.put(dynKey, HCatOutputFormat.getJobInfo(dynamicContexts.get(dynKey)));
        }

        localJobInfo = dynamicOutputJobInfo.get(dynKey);
        localWriter = baseDynamicWriters.get(dynKey);
        localSerDe = baseDynamicSerDe.get(dynKey);
        localObjectInspector = dynamicObjectInspectors.get(dynKey);
    } else {
        localJobInfo = jobInfo;
        localWriter = getBaseRecordWriter();
        localSerDe = serDe;
        localObjectInspector = objectInspector;
    }

    for (Integer colToDel : partColsToDel) {
        value.remove(colToDel);
    }

    // The key given by user is ignored
    try {
        localWriter.write(NullWritable.get(), localSerDe.serialize(value.getAll(), localObjectInspector));
    } catch (SerDeException e) {
        throw new IOException("Failed to serialize object", e);
    }
}
From source file:org.apache.hive.hcatalog.mapreduce.DynamicPartitionFileRecordWriterContainer.java
License:Apache License
@Override
protected LocalFileWriter getLocalFileWriter(HCatRecord value) throws IOException, HCatException {
    OutputJobInfo localJobInfo = null;

    // Calculate which writer to use from the remaining values - this needs to
    // be done before we delete cols.
    List<String> dynamicPartValues = new ArrayList<String>();
    for (Integer colToAppend : dynamicPartCols) {
        Object partitionValue = value.get(colToAppend);
        dynamicPartValues
                .add(partitionValue == null ? HIVE_DEFAULT_PARTITION_VALUE : partitionValue.toString());
    }

    String dynKey = dynamicPartValues.toString();
    if (!baseDynamicWriters.containsKey(dynKey)) {
        if ((maxDynamicPartitions != -1) && (baseDynamicWriters.size() > maxDynamicPartitions)) {
            throw new HCatException(ErrorType.ERROR_TOO_MANY_DYNAMIC_PTNS,
                    "Number of dynamic partitions being created "
                            + "exceeds configured max allowable partitions[" + maxDynamicPartitions
                            + "], increase parameter [" + HiveConf.ConfVars.DYNAMICPARTITIONMAXPARTS.varname
                            + "] if needed.");
        }

        org.apache.hadoop.mapred.TaskAttemptContext currTaskContext = HCatMapRedUtil
                .createTaskAttemptContext(context);
        configureDynamicStorageHandler(currTaskContext, dynamicPartValues);
        localJobInfo = HCatBaseOutputFormat.getJobInfo(currTaskContext.getConfiguration());

        // Setup serDe.
        SerDe currSerDe = ReflectionUtils.newInstance(storageHandler.getSerDeClass(),
                currTaskContext.getJobConf());
        try {
            InternalUtil.initializeOutputSerDe(currSerDe, currTaskContext.getConfiguration(), localJobInfo);
        } catch (SerDeException e) {
            throw new IOException("Failed to initialize SerDe", e);
        }

        // create base OutputFormat
        org.apache.hadoop.mapred.OutputFormat baseOF = ReflectionUtils
                .newInstance(storageHandler.getOutputFormatClass(), currTaskContext.getJobConf());

        // We are skipping calling checkOutputSpecs() for each partition
        // As it can throw a FileAlreadyExistsException when more than one
        // mapper is writing to a partition.
        // See HCATALOG-490, also to avoid contacting the namenode for each new
        // FileOutputFormat instance.
        // In general this should be ok for most FileOutputFormat implementations
        // but may become an issue for cases when the method is used to perform
        // other setup tasks.

        // Get Output Committer
        org.apache.hadoop.mapred.OutputCommitter baseOutputCommitter = currTaskContext.getJobConf()
                .getOutputCommitter();

        // Create currJobContext the latest so it gets all the config changes
        org.apache.hadoop.mapred.JobContext currJobContext = HCatMapRedUtil.createJobContext(currTaskContext);

        // Set up job.
        baseOutputCommitter.setupJob(currJobContext);

        // Recreate to refresh jobConf of currTask context.
        currTaskContext = HCatMapRedUtil.createTaskAttemptContext(currJobContext.getJobConf(),
                currTaskContext.getTaskAttemptID(), currTaskContext.getProgressible());

        // Set temp location.
        currTaskContext.getConfiguration().set("mapred.work.output.dir",
                new FileOutputCommitter(new Path(localJobInfo.getLocation()), currTaskContext).getWorkPath()
                        .toString());

        // Set up task.
        baseOutputCommitter.setupTask(currTaskContext);

        Path parentDir = new Path(currTaskContext.getConfiguration().get("mapred.work.output.dir"));
        Path childPath = new Path(parentDir, FileOutputFormat.getUniqueFile(currTaskContext,
                currTaskContext.getConfiguration().get("mapreduce.output.basename", "part"), ""));

        RecordWriter baseRecordWriter = baseOF.getRecordWriter(
                parentDir.getFileSystem(currTaskContext.getConfiguration()), currTaskContext.getJobConf(),
                childPath.toString(), InternalUtil.createReporter(currTaskContext));

        baseDynamicWriters.put(dynKey, baseRecordWriter);
        baseDynamicSerDe.put(dynKey, currSerDe);
        baseDynamicCommitters.put(dynKey, baseOutputCommitter);
        dynamicContexts.put(dynKey, currTaskContext);
        dynamicObjectInspectors.put(dynKey, InternalUtil.createStructObjectInspector(jobInfo.getOutputSchema()));
        dynamicOutputJobInfo.put(dynKey,
                HCatOutputFormat.getJobInfo(dynamicContexts.get(dynKey).getConfiguration()));
    }

    return new LocalFileWriter(baseDynamicWriters.get(dynKey), dynamicObjectInspectors.get(dynKey),
            baseDynamicSerDe.get(dynKey), dynamicOutputJobInfo.get(dynKey));
}
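The example above takes its prefix from the mapreduce.output.basename property, which FileOutputFormat-based writers use as the base name passed to getUniqueFile (defaulting to "part"). A short, illustrative sketch of overriding that property follows; the OutputBaseNameExample class and the "events" base name are assumptions for demonstration, not part of the source file above.

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.Job;

// Illustrative sketch: override the base name used as the getUniqueFile prefix,
// so task output files are named e.g. "events-r-00000" instead of "part-r-00000".
public final class OutputBaseNameExample {

    private OutputBaseNameExample() {
    }

    public static Job newJobWithBaseName() throws IOException {
        Job job = Job.getInstance(new Configuration());
        job.getConfiguration().set("mapreduce.output.basename", "events");
        return job;
    }
}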
From source file:org.apache.trevni.avro.mapreduce.AvroTrevniRecordWriterBase.java
License:Apache License
/**
 * Constructor.
 * @param context The TaskAttemptContext to supply the writer with information from the job configuration
 */
public AvroTrevniRecordWriterBase(TaskAttemptContext context) throws IOException {
    schema = initSchema(context);
    meta = filterMetadata(context.getConfiguration());
    writer = new AvroColumnWriter<T>(schema, meta, ReflectData.get());

    Path outputPath = FileOutputFormat.getOutputPath(context);
    String dir = FileOutputFormat.getUniqueFile(context, "part", "");
    dirPath = new Path(outputPath.toString() + "/" + dir);
    fs = dirPath.getFileSystem(context.getConfiguration());
    fs.mkdirs(dirPath);

    blockSize = fs.getDefaultBlockSize();
}