Example usage for org.apache.hadoop.fs Path getFileSystem

List of usage examples for org.apache.hadoop.fs Path getFileSystem

Introduction

On this page you can find example usages of org.apache.hadoop.fs.Path#getFileSystem(Configuration).

Prototype

public FileSystem getFileSystem(Configuration conf) throws IOException 

Document

Return the FileSystem that owns this Path.
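
Before the project examples below, here is a minimal, self-contained sketch of the call itself. It assumes only a default Configuration; the path string is a placeholder.

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class GetFileSystemExample {
    public static void main(String[] args) throws IOException {
        Configuration conf = new Configuration();
        // Placeholder path; the scheme (hdfs://, file://, s3a://, ...) determines
        // which FileSystem implementation getFileSystem(conf) resolves to.
        Path path = new Path("/tmp/example.txt");
        FileSystem fs = path.getFileSystem(conf);
        System.out.println("Owning FileSystem: " + fs.getUri());
        System.out.println("Path exists: " + fs.exists(path));
    }
}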

Usage

From source file: com.cloudera.science.quince.FileUtils.java

License: Open Source License

public static void deleteSampleGroup(Path path, Configuration conf, String sampleGroup) throws IOException {
    FileSystem fs = path.getFileSystem(conf);
    if (!fs.exists(path)) {
        return;
    }
    for (FileStatus chrStatus : fs.listStatus(path, new PartitionPathFilter("chr"))) {
        for (FileStatus posStatus : fs.listStatus(chrStatus.getPath(), new PartitionPathFilter("pos"))) {
            for (FileStatus sampleGroupStatus : fs.listStatus(posStatus.getPath(),
                    new PartitionPathFilter("sample_group", sampleGroup))) {
                fs.delete(sampleGroupStatus.getPath(), true);
            }
        }
    }
}

From source file: com.cloudera.science.quince.LoadVariantsTool.java

License: Open Source License

@Override
public int run(String[] args) throws Exception {
    JCommander jc = new JCommander(this);
    try {
        jc.parse(args);
    } catch (ParameterException e) {
        jc.usage();
        return 1;
    }

    if (paths == null || paths.size() != 2) {
        jc.usage();
        return 1;
    }

    String inputPath = paths.get(0);
    String outputPath = paths.get(1);

    Configuration conf = getConf();
    // Copy records to avoid problem with Parquet string statistics not being correct.
    // This can be removed from parquet 1.8.0
    // (see https://issues.apache.org/jira/browse/PARQUET-251).
    conf.setBoolean(DatasetKeyOutputFormat.KITE_COPY_RECORDS, true);

    Path path = new Path(inputPath);

    if (path.getName().endsWith(".vcf")) {
        int size = 500000;
        byte[] bytes = new byte[size];
        InputStream inputStream = path.getFileSystem(conf).open(path);
        inputStream.read(bytes, 0, size);
        conf.set(VariantContextToVariantFn.VARIANT_HEADER, Base64.encodeBase64String(bytes));
    }

    Pipeline pipeline = new MRPipeline(getClass(), conf);
    PCollection<Variant> records = readVariants(path, conf, pipeline);

    PCollection<FlatVariant> flatRecords = records.parallelDo(new FlattenVariantFn(),
            Avros.specifics(FlatVariant.class));

    DatasetDescriptor desc = new DatasetDescriptor.Builder().schema(FlatVariant.getClassSchema())
            .partitionStrategy(buildPartitionStrategy(segmentSize)).format(Formats.PARQUET)
            .compressionType(CompressionType.Uncompressed).build();

    View<FlatVariant> dataset;
    if (Datasets.exists(outputPath)) {
        dataset = Datasets.load(outputPath, FlatVariant.class).getDataset().with("sample_group", sampleGroup);
    } else {
        dataset = Datasets.create(outputPath, desc, FlatVariant.class).getDataset().with("sample_group",
                sampleGroup);
    }

    int numReducers = conf.getInt("mapreduce.job.reduces", 1);
    System.out.println("Num reducers: " + numReducers);

    final Schema sortKeySchema = SchemaBuilder.record("sortKey").fields().requiredString("sampleId")
            .endRecord();

    PCollection<FlatVariant> partitioned = CrunchDatasets.partitionAndSort(flatRecords, dataset,
            new FlatVariantRecordMapFn(sortKeySchema), sortKeySchema, numReducers, 1);

    try {
        Target.WriteMode writeMode = overwrite ? Target.WriteMode.OVERWRITE : Target.WriteMode.DEFAULT;
        pipeline.write(partitioned, CrunchDatasets.asTarget(dataset), writeMode);
    } catch (CrunchRuntimeException e) {
        LOG.error("Crunch runtime error", e);
        return 1;
    }

    PipelineResult result = pipeline.done();
    return result.succeeded() ? 0 : 1;

}

From source file: com.cloudera.science.quince.SampleUtils.java

License: Open Source License

public static Set<String> uniqueSamples(Configuration conf, Path[] vcfs) throws IOException {
    Set<String> samples = new LinkedHashSet<>();
    for (Path vcf : vcfs) {
        InputStream inputStream = vcf.getFileSystem(conf).open(vcf);
        VcfBlockIterator iterator = new VcfBlockIterator(inputStream, new FullVcfCodec());
        VCFHeader header = iterator.getHeader();
        samples.addAll(header.getGenotypeSamples());
    }
    return samples;
}

From source file: com.cloudera.science.quince.SchemaUtils.java

License: Open Source License

public static Path findFile(Path path, Configuration conf) throws IOException {
    FileSystem fs = path.getFileSystem(conf);
    if (fs.isDirectory(path)) {
        FileStatus[] fileStatuses = fs.listStatus(path, new PathFilter() {
            @Override
            public boolean accept(Path p) {
                String name = p.getName();
                return !name.startsWith("_") && !name.startsWith(".");
            }
        });
        return fileStatuses[0].getPath();
    } else {
        return path;
    }
}

From source file: com.cloudera.science.quince.VCFToGA4GHVariantFn.java

License: Open Source License

public static void configureHeaders(Configuration conf, Path[] vcfs, String sampleGroup) throws IOException {
    List<VCFHeader> headers = new ArrayList<>();
    for (Path vcf : vcfs) {
        InputStream inputStream = vcf.getFileSystem(conf).open(vcf);
        VcfBlockIterator iterator = new VcfBlockIterator(inputStream, new FullVcfCodec());
        VCFHeader header = iterator.getHeader();
        header.addMetaDataLine(new VCFHeaderLine(VARIANT_SET_ID, vcf.getName()));
        headers.add(header);
    }
    VCFHeader[] headersArray = headers.toArray(new VCFHeader[headers.size()]);
    conf.set(VARIANT_HEADERS, Base64.encodeBase64String(SerializationUtils.serialize(headersArray)));
    if (sampleGroup != null) {
        conf.set(SAMPLE_GROUP, sampleGroup);
    }
}

From source file: com.cloudera.spark.bulkload.TotalOrderPartitioner.java

License: Apache License

/**
   * Read in the partition file and build indexing data structures.
   * If the keytype is {@link BinaryComparable} and
   * <tt>total.order.partitioner.natural.order</tt> is not false, a trie
   * of the first <tt>total.order.partitioner.max.trie.depth</tt>(2) + 1 bytes
   * will be built. Otherwise, keys will be located using a binary search of
   * the partition keyset using the {@link RawComparator}
   * defined for this job. The input file must be sorted with the same
   * comparator and contain {@link Job#getNumReduceTasks()} - 1 keys.
   */
  @SuppressWarnings("unchecked") // keytype from conf not static
  public void setConf(Configuration conf) {
      try {
          this.conf = conf;
          String parts = getPartitionFile(conf);
          final Path partFile = new Path(parts);
          final FileSystem fs = (DEFAULT_PATH.equals(parts)) ? FileSystem.getLocal(conf) // assume in DistributedCache
                  : partFile.getFileSystem(conf);

          Job job = new Job(conf);
          Class<K> keyClass = (Class<K>) job.getMapOutputKeyClass();
          K[] splitPoints = readPartitions(fs, partFile, keyClass, conf);
          if (splitPoints.length != job.getNumReduceTasks() - 1) {
              throw new IOException("Wrong number of partitions in keyset");
          }
          RawComparator<K> comparator = (RawComparator<K>) job.getSortComparator();
          for (int i = 0; i < splitPoints.length - 1; ++i) {
              if (comparator.compare(splitPoints[i], splitPoints[i + 1]) >= 0) {
                  throw new IOException("Split points are out of order");
              }
          }
          boolean natOrder = conf.getBoolean(NATURAL_ORDER, true);
          if (natOrder && BinaryComparable.class.isAssignableFrom(keyClass)) {
              partitions = buildTrie((BinaryComparable[]) splitPoints, 0, splitPoints.length, new byte[0],
                      // Now that blocks of identical splitless trie nodes are 
                      // represented reentrantly, and we develop a leaf for any trie
                      // node with only one split point, the only reason for a depth
                      // limit is to refute stack overflow or bloat in the pathological
                      // case where the split points are long and mostly look like bytes 
                      // iii...iixii...iii   .  Therefore, we make the default depth
                      // limit large but not huge.
                      conf.getInt(MAX_TRIE_DEPTH, 200));
          } else {
              partitions = new BinarySearchNode(splitPoints, comparator);
          }
      } catch (IOException e) {
          throw new IllegalArgumentException("Can't read partitions file", e);
      }
  }
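
The setConf method above loads the split points through partFile.getFileSystem(conf). For context, a driver normally writes that partition file and registers the partitioner before the job runs. The sketch below is not part of the source above; it uses the stock org.apache.hadoop.mapreduce.lib.partition classes, which this Cloudera copy mirrors, and the reducer count and paths are placeholders.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.partition.InputSampler;
import org.apache.hadoop.mapreduce.lib.partition.TotalOrderPartitioner;

public class TotalOrderDriverSketch {
    // Assumes the job's input format and input paths have already been set.
    public static void configureSort(Configuration conf) throws Exception {
        Job job = Job.getInstance(conf, "total-order-sort");
        job.setNumReduceTasks(4);
        job.setPartitionerClass(TotalOrderPartitioner.class);

        // Point the partitioner at its partition file; setConf later opens
        // this path via partFile.getFileSystem(conf).
        Path partitionFile = new Path("/tmp/_partitions");
        TotalOrderPartitioner.setPartitionFile(job.getConfiguration(), partitionFile);

        // Sample the input to produce numReduceTasks - 1 sorted split points.
        InputSampler.writePartitionFile(job,
                new InputSampler.RandomSampler<Text, Text>(0.1, 10000, 10));
    }
}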

From source file: com.cloudera.sqoop.mapreduce.AvroOutputFormat.java

License: Apache License

@Override
public RecordWriter<AvroWrapper<T>, NullWritable> getRecordWriter(TaskAttemptContext context)
        throws IOException, InterruptedException {

    Schema schema = AvroJob.getMapOutputSchema(context.getConfiguration());

    final DataFileWriter<T> WRITER = new DataFileWriter<T>(new GenericDatumWriter<T>());

    Path path = getDefaultWorkFile(context, org.apache.avro.mapred.AvroOutputFormat.EXT);
    WRITER.create(schema, path.getFileSystem(context.getConfiguration()).create(path));

    return new RecordWriter<AvroWrapper<T>, NullWritable>() {
        @Override
        public void write(AvroWrapper<T> wrapper, NullWritable ignore) throws IOException {
            WRITER.append(wrapper.datum());
        }

        @Override
        public void close(TaskAttemptContext context) throws IOException, InterruptedException {
            WRITER.close();
        }
    };
}

From source file: com.cloudera.sqoop.mapreduce.RawKeyTextOutputFormat.java

License: Apache License

public RecordWriter<K, V> getRecordWriter(TaskAttemptContext context) throws IOException {
    boolean isCompressed = getCompressOutput(context);
    Configuration conf = context.getConfiguration();
    String ext = "";
    CompressionCodec codec = null;

    if (isCompressed) {
        // create the named codec
        Class<? extends CompressionCodec> codecClass = getOutputCompressorClass(context, GzipCodec.class);
        codec = ReflectionUtils.newInstance(codecClass, conf);

        ext = codec.getDefaultExtension();
    }

    Path file = getDefaultWorkFile(context, ext);
    FileSystem fs = file.getFileSystem(conf);
    FSDataOutputStream fileOut = fs.create(file, false);
    DataOutputStream ostream = fileOut;

    if (isCompressed) {
        ostream = new DataOutputStream(codec.createOutputStream(fileOut));
    }

    return new RawKeyRecordWriter<K, V>(ostream);
}

From source file: com.cloudera.training.metrics.JobHistoryHelper.java

License: Apache License

public static JobHistory.JobInfo getJobInfoFromHdfsOutputDir(String outputDir, Configuration conf)
        throws IOException {
    Path output = new Path(outputDir);
    Path historyLogDir = new Path(output, "_logs/history");
    FileSystem fs = output.getFileSystem(conf);
    if (!fs.exists(output)) {
        throw new IOException("History directory " + historyLogDir.toString() + " does not exist");
    }
    Path[] jobFiles = FileUtil.stat2Paths(fs.listStatus(historyLogDir, jobLogFileFilter));
    if (jobFiles.length == 0) {
        throw new IOException("Not a valid history directory " + historyLogDir.toString());
    }
    String[] jobDetails = JobHistory.JobInfo.decodeJobHistoryFileName(jobFiles[0].getName()).split("_");
    String jobId = jobDetails[2] + "_" + jobDetails[3] + "_" + jobDetails[4];
    JobHistory.JobInfo job = new JobHistory.JobInfo(jobId);
    DefaultJobHistoryParser.parseJobTasks(jobFiles[0].toString(), job, fs);
    return job;
}

From source file: com.cloudy.mapred.base.JobUtil.java

License: Apache License

public static void delete(Configuration conf, Path path) throws IOException {
    if (conf == null) {
        conf = new Configuration();
    }
    FileSystem fs = path.getFileSystem(conf);
    if (fs.exists(path)) {
        log.info("Deleting {}", path);
        fs.delete(path, true);
    }
}