Usage examples for org.apache.hadoop.fs.Path.getFileSystem(Configuration)
public FileSystem getFileSystem(Configuration conf) throws IOException
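Path.getFileSystem(Configuration) resolves the FileSystem instance that owns the path (based on its URI scheme, falling back to fs.defaultFS from the Configuration), which is the pattern every example below builds on. The following minimal sketch (class name and path are placeholders, not taken from any of the source files below) shows the basic usage, assuming a standard Hadoop client classpath:

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class GetFileSystemExample {
    public static void main(String[] args) throws IOException {
        Configuration conf = new Configuration();
        // Placeholder path; any hdfs://, file:// or scheme-less path works the same way.
        Path path = new Path(args.length > 0 ? args[0] : "/tmp/example.txt");

        // Resolve the FileSystem that backs this Path, then operate through it.
        FileSystem fs = path.getFileSystem(conf);

        if (fs.exists(path)) {
            System.out.println("Found " + path + ", " + fs.getFileStatus(path).getLen() + " bytes");
        } else {
            System.out.println("Path does not exist: " + path);
        }
    }
}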
From source file:com.cloudera.science.quince.FileUtils.java
License:Open Source License
public static void deleteSampleGroup(Path path, Configuration conf, String sampleGroup) throws IOException {
    FileSystem fs = path.getFileSystem(conf);
    if (!fs.exists(path)) {
        return;
    }
    for (FileStatus chrStatus : fs.listStatus(path, new PartitionPathFilter("chr"))) {
        for (FileStatus posStatus : fs.listStatus(chrStatus.getPath(), new PartitionPathFilter("pos"))) {
            for (FileStatus sampleGroupStatus : fs.listStatus(posStatus.getPath(),
                    new PartitionPathFilter("sample_group", sampleGroup))) {
                fs.delete(sampleGroupStatus.getPath(), true);
            }
        }
    }
}
From source file:com.cloudera.science.quince.LoadVariantsTool.java
License:Open Source License
@Override
public int run(String[] args) throws Exception {
    JCommander jc = new JCommander(this);
    try {
        jc.parse(args);
    } catch (ParameterException e) {
        jc.usage();
        return 1;
    }
    if (paths == null || paths.size() != 2) {
        jc.usage();
        return 1;
    }
    String inputPath = paths.get(0);
    String outputPath = paths.get(1);

    Configuration conf = getConf();
    // Copy records to avoid problem with Parquet string statistics not being correct.
    // This can be removed from Parquet 1.8.0
    // (see https://issues.apache.org/jira/browse/PARQUET-251).
    conf.setBoolean(DatasetKeyOutputFormat.KITE_COPY_RECORDS, true);

    Path path = new Path(inputPath);
    if (path.getName().endsWith(".vcf")) {
        int size = 500000;
        byte[] bytes = new byte[size];
        InputStream inputStream = path.getFileSystem(conf).open(path);
        inputStream.read(bytes, 0, size);
        conf.set(VariantContextToVariantFn.VARIANT_HEADER, Base64.encodeBase64String(bytes));
    }

    Pipeline pipeline = new MRPipeline(getClass(), conf);
    PCollection<Variant> records = readVariants(path, conf, pipeline);

    PCollection<FlatVariant> flatRecords = records.parallelDo(new FlattenVariantFn(),
            Avros.specifics(FlatVariant.class));

    DatasetDescriptor desc = new DatasetDescriptor.Builder()
            .schema(FlatVariant.getClassSchema())
            .partitionStrategy(buildPartitionStrategy(segmentSize))
            .format(Formats.PARQUET)
            .compressionType(CompressionType.Uncompressed)
            .build();

    View<FlatVariant> dataset;
    if (Datasets.exists(outputPath)) {
        dataset = Datasets.load(outputPath, FlatVariant.class)
                .getDataset().with("sample_group", sampleGroup);
    } else {
        dataset = Datasets.create(outputPath, desc, FlatVariant.class)
                .getDataset().with("sample_group", sampleGroup);
    }

    int numReducers = conf.getInt("mapreduce.job.reduces", 1);
    System.out.println("Num reducers: " + numReducers);

    final Schema sortKeySchema = SchemaBuilder.record("sortKey").fields()
            .requiredString("sampleId").endRecord();

    PCollection<FlatVariant> partitioned = CrunchDatasets.partitionAndSort(flatRecords, dataset,
            new FlatVariantRecordMapFn(sortKeySchema), sortKeySchema, numReducers, 1);

    try {
        Target.WriteMode writeMode = overwrite ? Target.WriteMode.OVERWRITE : Target.WriteMode.DEFAULT;
        pipeline.write(partitioned, CrunchDatasets.asTarget(dataset), writeMode);
    } catch (CrunchRuntimeException e) {
        LOG.error("Crunch runtime error", e);
        return 1;
    }

    PipelineResult result = pipeline.done();
    return result.succeeded() ? 0 : 1;
}
From source file:com.cloudera.science.quince.SampleUtils.java
License:Open Source License
public static Set<String> uniqueSamples(Configuration conf, Path[] vcfs) throws IOException {
    Set<String> samples = new LinkedHashSet<>();
    for (Path vcf : vcfs) {
        InputStream inputStream = vcf.getFileSystem(conf).open(vcf);
        VcfBlockIterator iterator = new VcfBlockIterator(inputStream, new FullVcfCodec());
        VCFHeader header = iterator.getHeader();
        samples.addAll(header.getGenotypeSamples());
    }
    return samples;
}
From source file:com.cloudera.science.quince.SchemaUtils.java
License:Open Source License
public static Path findFile(Path path, Configuration conf) throws IOException {
    FileSystem fs = path.getFileSystem(conf);
    if (fs.isDirectory(path)) {
        FileStatus[] fileStatuses = fs.listStatus(path, new PathFilter() {
            @Override
            public boolean accept(Path p) {
                String name = p.getName();
                return !name.startsWith("_") && !name.startsWith(".");
            }
        });
        return fileStatuses[0].getPath();
    } else {
        return path;
    }
}
From source file:com.cloudera.science.quince.VCFToGA4GHVariantFn.java
License:Open Source License
public static void configureHeaders(Configuration conf, Path[] vcfs, String sampleGroup) throws IOException {
    List<VCFHeader> headers = new ArrayList<>();
    for (Path vcf : vcfs) {
        InputStream inputStream = vcf.getFileSystem(conf).open(vcf);
        VcfBlockIterator iterator = new VcfBlockIterator(inputStream, new FullVcfCodec());
        VCFHeader header = iterator.getHeader();
        header.addMetaDataLine(new VCFHeaderLine(VARIANT_SET_ID, vcf.getName()));
        headers.add(header);
    }
    VCFHeader[] headersArray = headers.toArray(new VCFHeader[headers.size()]);
    conf.set(VARIANT_HEADERS, Base64.encodeBase64String(SerializationUtils.serialize(headersArray)));
    if (sampleGroup != null) {
        conf.set(SAMPLE_GROUP, sampleGroup);
    }
}
From source file:com.cloudera.spark.bulkload.TotalOrderPartitioner.java
License:Apache License
/**
 * Read in the partition file and build indexing data structures.
 * If the keytype is {@link BinaryComparable} and
 * <tt>total.order.partitioner.natural.order</tt> is not false, a trie
 * of the first <tt>total.order.partitioner.max.trie.depth</tt>(2) + 1 bytes
 * will be built. Otherwise, keys will be located using a binary search of
 * the partition keyset using the {@link RawComparator}
 * defined for this job. The input file must be sorted with the same
 * comparator and contain {@link Job#getNumReduceTasks()} - 1 keys.
 */
@SuppressWarnings("unchecked") // keytype from conf not static
public void setConf(Configuration conf) {
    try {
        this.conf = conf;
        String parts = getPartitionFile(conf);
        final Path partFile = new Path(parts);
        final FileSystem fs = (DEFAULT_PATH.equals(parts))
                ? FileSystem.getLocal(conf) // assume in DistributedCache
                : partFile.getFileSystem(conf);

        Job job = new Job(conf);
        Class<K> keyClass = (Class<K>) job.getMapOutputKeyClass();
        K[] splitPoints = readPartitions(fs, partFile, keyClass, conf);
        if (splitPoints.length != job.getNumReduceTasks() - 1) {
            throw new IOException("Wrong number of partitions in keyset");
        }
        RawComparator<K> comparator = (RawComparator<K>) job.getSortComparator();
        for (int i = 0; i < splitPoints.length - 1; ++i) {
            if (comparator.compare(splitPoints[i], splitPoints[i + 1]) >= 0) {
                throw new IOException("Split points are out of order");
            }
        }
        boolean natOrder = conf.getBoolean(NATURAL_ORDER, true);
        if (natOrder && BinaryComparable.class.isAssignableFrom(keyClass)) {
            partitions = buildTrie((BinaryComparable[]) splitPoints, 0, splitPoints.length, new byte[0],
                    // Now that blocks of identical splitless trie nodes are
                    // represented reentrantly, and we develop a leaf for any trie
                    // node with only one split point, the only reason for a depth
                    // limit is to refute stack overflow or bloat in the pathological
                    // case where the split points are long and mostly look like bytes
                    // iii...iixii...iii . Therefore, we make the default depth
                    // limit large but not huge.
                    conf.getInt(MAX_TRIE_DEPTH, 200));
        } else {
            partitions = new BinarySearchNode(splitPoints, comparator);
        }
    } catch (IOException e) {
        throw new IllegalArgumentException("Can't read partitions file", e);
    }
}
From source file:com.cloudera.sqoop.mapreduce.AvroOutputFormat.java
License:Apache License
@Override
public RecordWriter<AvroWrapper<T>, NullWritable> getRecordWriter(TaskAttemptContext context)
        throws IOException, InterruptedException {
    Schema schema = AvroJob.getMapOutputSchema(context.getConfiguration());

    final DataFileWriter<T> WRITER = new DataFileWriter<T>(new GenericDatumWriter<T>());

    Path path = getDefaultWorkFile(context, org.apache.avro.mapred.AvroOutputFormat.EXT);
    WRITER.create(schema, path.getFileSystem(context.getConfiguration()).create(path));

    return new RecordWriter<AvroWrapper<T>, NullWritable>() {
        @Override
        public void write(AvroWrapper<T> wrapper, NullWritable ignore) throws IOException {
            WRITER.append(wrapper.datum());
        }

        @Override
        public void close(TaskAttemptContext context) throws IOException, InterruptedException {
            WRITER.close();
        }
    };
}
From source file:com.cloudera.sqoop.mapreduce.RawKeyTextOutputFormat.java
License:Apache License
public RecordWriter<K, V> getRecordWriter(TaskAttemptContext context) throws IOException {
    boolean isCompressed = getCompressOutput(context);
    Configuration conf = context.getConfiguration();
    String ext = "";
    CompressionCodec codec = null;

    if (isCompressed) {
        // create the named codec
        Class<? extends CompressionCodec> codecClass = getOutputCompressorClass(context, GzipCodec.class);
        codec = ReflectionUtils.newInstance(codecClass, conf);

        ext = codec.getDefaultExtension();
    }

    Path file = getDefaultWorkFile(context, ext);
    FileSystem fs = file.getFileSystem(conf);
    FSDataOutputStream fileOut = fs.create(file, false);
    DataOutputStream ostream = fileOut;

    if (isCompressed) {
        ostream = new DataOutputStream(codec.createOutputStream(fileOut));
    }

    return new RawKeyRecordWriter<K, V>(ostream);
}
From source file:com.cloudera.training.metrics.JobHistoryHelper.java
License:Apache License
public static JobHistory.JobInfo getJobInfoFromHdfsOutputDir(String outputDir, Configuration conf)
        throws IOException {
    Path output = new Path(outputDir);
    Path historyLogDir = new Path(output, "_logs/history");
    FileSystem fs = output.getFileSystem(conf);
    if (!fs.exists(output)) {
        throw new IOException("History directory " + historyLogDir.toString() + " does not exist");
    }

    Path[] jobFiles = FileUtil.stat2Paths(fs.listStatus(historyLogDir, jobLogFileFilter));
    if (jobFiles.length == 0) {
        throw new IOException("Not a valid history directory " + historyLogDir.toString());
    }

    String[] jobDetails = JobHistory.JobInfo.decodeJobHistoryFileName(jobFiles[0].getName()).split("_");
    String jobId = jobDetails[2] + "_" + jobDetails[3] + "_" + jobDetails[4];
    JobHistory.JobInfo job = new JobHistory.JobInfo(jobId);

    DefaultJobHistoryParser.parseJobTasks(jobFiles[0].toString(), job, fs);

    return job;
}
From source file:com.cloudy.mapred.base.JobUtil.java
License:Apache License
public static void delete(Configuration conf, Path path) throws IOException {
    if (conf == null) {
        conf = new Configuration();
    }
    FileSystem fs = path.getFileSystem(conf);
    if (fs.exists(path)) {
        log.info("Deleting {}", path);
        fs.delete(path, true);
    }
}