Example usage for org.apache.hadoop.fs Path getFileSystem

Introduction

This page collects usage examples for org.apache.hadoop.fs.Path.getFileSystem(Configuration).

Prototype

public FileSystem getFileSystem(Configuration conf) throws IOException 

Document

Return the FileSystem that owns this Path.
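
Before the individual sources, here is a minimal, self-contained sketch of the call. It is not taken from any of the examples below, and the namenode address and file path are hypothetical. The point of path.getFileSystem(conf) is that the Path's scheme and authority select the matching FileSystem implementation (HDFS, local, S3, and so on), whereas FileSystem.get(conf) always returns the configured default.

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class GetFileSystemExample {
    public static void main(String[] args) throws IOException {
        Configuration conf = new Configuration();
        // Hypothetical location; any scheme the configuration supports works here.
        Path path = new Path("hdfs://namenode:8020/data/input.txt");
        // Resolve the FileSystem that owns this Path from its scheme and authority.
        FileSystem fs = path.getFileSystem(conf);
        if (fs.exists(path)) {
            try (FSDataInputStream in = fs.open(path)) {
                System.out.println("first byte: " + in.read());
            }
        }
    }
}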

Usage

From source file:com.cloudera.knittingboar.sgd.POLRMasterDriver.java

License:Apache License

/**
 * [ needs to be checked ]
 * 
 * NOTE: This should only be used for durability purposes in checkpointing the
 * workers
 * 
 * @param outputFilename
 * @param conf
 * @throws Exception
 */
public void SaveModelToHDFS(String outputFilename, Configuration conf) throws Exception {

    Path path = new Path(outputFilename);
    FileSystem fs = path.getFileSystem(conf);
    FSDataOutputStream modelHDFSOutput = fs.create(path, true);

    try {
        polr_modelparams.saveTo(modelHDFSOutput);
    } finally {
        modelHDFSOutput.close();
    }

}
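
A note on this example: fs.create(path, true) passes overwrite=true, so a previous checkpoint written to the same path is replaced in full rather than appended to, which fits the stated durability purpose.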

From source file:com.cloudera.recordservice.examples.terasort.TeraGen.java

License:Apache License

/**
 * @param args the cli arguments
 */
@Override
public int run(String[] args) throws IOException, InterruptedException, ClassNotFoundException {
    Job job = Job.getInstance(getConf());
    if (args.length != 2) {
        usage();
        return 2;
    }
    setNumberOfRows(job, parseHumanLong(args[0]));
    Path outputDir = new Path(args[1]);
    if (outputDir.getFileSystem(getConf()).exists(outputDir)) {
        throw new IOException("Output directory " + outputDir + " already exists.");
    }
    FileOutputFormat.setOutputPath(job, outputDir);
    job.setJobName("TeraGen");
    job.setJarByClass(TeraGen.class);
    job.setMapperClass(SortGenMapper.class);
    job.setNumReduceTasks(0);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);
    job.setInputFormatClass(RangeInputFormat.class);
    job.setOutputFormatClass(TeraOutputFormat.class);
    return job.waitForCompletion(true) ? 0 : 1;
}
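
Note that the existence check goes through outputDir.getFileSystem(getConf()) rather than a hard-coded FileSystem.get(getConf()), so an already-existing output directory is rejected correctly whatever scheme the output path uses.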

From source file:com.cloudera.recordservice.examples.terasort.TeraInputFormat.java

License:Apache License

/**
 * Use the input splits to take samples of the input and generate sample
 * keys. By default reads 100,000 keys from 10 locations in the input, sorts
 * them and picks N-1 keys to generate N equally sized partitions.
 * @param job the job to sample
 * @param partFile where to write the output file to
 * @throws Throwable if something goes wrong
 */
public static void writePartitionFile(final JobContext job, Path partFile) throws Throwable {
    long t1 = System.currentTimeMillis();
    Configuration conf = job.getConfiguration();
    final TeraInputFormat inFormat = new TeraInputFormat();
    final TextSampler sampler = new TextSampler();
    int partitions = job.getNumReduceTasks();
    long sampleSize = conf.getLong(SAMPLE_SIZE, 100000);
    final List<InputSplit> splits = inFormat.getSplits(job);
    long t2 = System.currentTimeMillis();
    System.out.println("Computing input splits took " + (t2 - t1) + "ms");
    int samples = Math.min(conf.getInt(NUM_PARTITIONS, 10), splits.size());
    System.out.println("Sampling " + samples + " splits of " + splits.size());
    final long recordsPerSample = sampleSize / samples;
    final int sampleStep = splits.size() / samples;
    Thread[] samplerReader = new Thread[samples];
    SamplerThreadGroup threadGroup = new SamplerThreadGroup("Sampler Reader Thread Group");
    // take N samples from different parts of the input
    for (int i = 0; i < samples; ++i) {
        final int idx = i;
        samplerReader[i] = new Thread(threadGroup, "Sampler Reader " + idx) {
            {
                setDaemon(true);
            }

            @Override
            public void run() {
                long records = 0;
                try {
                    TaskAttemptContext context = new TaskAttemptContextImpl(job.getConfiguration(),
                            new TaskAttemptID());
                    RecordReader<Text, Text> reader = inFormat.createRecordReader(splits.get(sampleStep * idx),
                            context);
                    reader.initialize(splits.get(sampleStep * idx), context);
                    while (reader.nextKeyValue()) {
                        sampler.addKey(new Text(reader.getCurrentKey()));
                        records += 1;
                        if (recordsPerSample <= records) {
                            break;
                        }
                    }
                } catch (IOException ie) {
                    System.err.println(
                            "Got an exception while reading splits " + StringUtils.stringifyException(ie));
                    throw new RuntimeException(ie);
                } catch (InterruptedException e) {
                    // Ignored: an interrupted sampler thread simply stops sampling early.
                }
            }
        };
        samplerReader[i].start();
    }
    FileSystem outFs = partFile.getFileSystem(conf);
    DataOutputStream writer = outFs.create(partFile, true, 64 * 1024, (short) 10,
            outFs.getDefaultBlockSize(partFile));
    for (int i = 0; i < samples; i++) {
        try {
            samplerReader[i].join();
            if (threadGroup.getThrowable() != null) {
                throw threadGroup.getThrowable();
            }
        } catch (InterruptedException e) {
            // Ignored: continue joining the remaining sampler threads.
        }
    }
    for (Text split : sampler.createPartitions(partitions)) {
        split.write(writer);
    }
    writer.close();
    long t3 = System.currentTimeMillis();
    System.out.println("Computing partitions took " + (t3 - t2) + "ms");
}
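
One detail worth noticing: the partition file is created with an explicit replication factor of 10 (the (short) 10 argument to outFs.create), presumably so that this small, widely-read file has replicas close to most nodes in the cluster.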

From source file:com.cloudera.recordservice.examples.terasort.TeraOutputFormat.java

License:Apache License

@Override
public RecordWriter<Text, Text> getRecordWriter(TaskAttemptContext job) throws IOException {
    Path file = getDefaultWorkFile(job, "");
    FileSystem fs = file.getFileSystem(job.getConfiguration());
    FSDataOutputStream fileOut = fs.create(file);
    return new TeraRecordWriter(fileOut, job);
}

From source file:com.cloudera.recordservice.mapreduce.testapps.RecordCount.java

License:Apache License

public static long countRecords(String path) throws IOException {
    String output = TestUtil.getTempDirectory();
    Path inputPath = new Path(path);
    Path outputPath = new Path(output);

    JobConf conf = new JobConf(RecordCount.class);
    conf.setJobName("recordcount");

    conf.setOutputKeyClass(NullWritable.class);
    conf.setOutputValueClass(LongWritable.class);

    conf.setInt("mapreduce.job.reduces", 1);
    conf.setMapperClass(Map.class);
    conf.setCombinerClass(Reduce.class);
    conf.setReducerClass(Reduce.class);

    conf.setInputFormat(com.cloudera.recordservice.mapred.TextInputFormat.class);
    conf.setOutputFormat(TextOutputFormat.class);

    FileInputFormat.setInputPaths(conf, inputPath);
    FileOutputFormat.setOutputPath(conf, outputPath);

    JobClient.runJob(conf);

    // Read the result and return it. Since we set the number of reducers to 1,
    // there is always just one file containing the value.
    FileSystem fs = outputPath.getFileSystem(conf);
    FSDataInputStream resultStream = fs.open(new Path(output + "/part-00000"));
    byte[] bytes = new byte[16];
    int length = resultStream.read(bytes);
    String result = new String(bytes, 0, length).trim();
    return Long.parseLong(result);
}
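
A caveat on the fixed 16-byte read buffer: Long.MAX_VALUE has 19 decimal digits, so the buffer is only large enough because the record counts produced in these tests are small; a robust reader would read the whole first line of part-00000 instead.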

From source file:com.cloudera.recordservice.pig.HCatRSLoader.java

License:Apache License

/**
 * A utility method to get the size of inputs. This is accomplished by summing the
 * size of all input paths on supported FileSystems. Locations whose size cannot be
 * determined are ignored. Note that non-FileSystem and unpartitioned locations will not
 * report their input size by default. This method was copied from HCatBaseLoader to use
 * the Record Service InputJobInfo.
 */
protected static long getSizeInBytes(InputJobInfo inputJobInfo) throws IOException {
    Configuration conf = new Configuration();
    long sizeInBytes = 0;

    for (PartInfo partInfo : inputJobInfo.getPartitions()) {
        try {
            Path p = new Path(partInfo.getLocation());
            if (p.getFileSystem(conf).isFile(p)) {
                sizeInBytes += p.getFileSystem(conf).getFileStatus(p).getLen();
            } else {
                FileStatus[] fileStatuses = p.getFileSystem(conf).listStatus(p);
                if (fileStatuses != null) {
                    for (FileStatus child : fileStatuses) {
                        sizeInBytes += child.getLen();
                    }
                }
            }
        } catch (IOException e) {
            // Report size to the extent possible.
        }
    }
    LOG.info("SIZE:" + sizeInBytes + "\n\n");
    return sizeInBytes;
}
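
Although p.getFileSystem(conf) is called up to three times per partition here, Hadoop caches FileSystem instances by scheme, authority, and user by default, so the repeated calls return the same cached object at negligible cost.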

From source file:com.cloudera.sa.ExcelRecordReader.java

License:Apache License

@Override
public void initialize(InputSplit split, TaskAttemptContext context) throws IOException, InterruptedException {
    FileSplit fileSplit = (FileSplit) split;
    Configuration conf = context.getConfiguration();
    Path file = fileSplit.getPath();
    FileSystem fs = file.getFileSystem(conf);
    this.in = fs.open(file);
    XSSFWorkbook workbook = new XSSFWorkbook(this.in);
    XSSFSheet sheet = workbook.getSheetAt(0);
    this.totalRows = sheet.getPhysicalNumberOfRows();
    this.processedRows = 0;
    this.rowIterator = sheet.rowIterator();
}
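
Be aware that the XSSFWorkbook(InputStream) constructor parses the entire .xlsx file eagerly, so this record reader holds the whole workbook in memory; that is presumably acceptable here because each split corresponds to a single spreadsheet of modest size.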

From source file:com.cloudera.science.avro.streaming.AvroAsJSONOutputFormat.java

License:Open Source License

@Override
public RecordWriter<Text, Text> getRecordWriter(FileSystem ignored, JobConf job, String name,
        Progressable progress) throws IOException {
    if (schema == null) {
        SchemaLoader loader = new SchemaLoader(job);
        this.schema = loader.load(job.get(SCHEMA_LITERAL), job.get(SCHEMA_URL), job.get(SCHEMA_TYPE_NAME));
        this.converter = new JsonConverter(schema);
        this.readKey = job.getBoolean(READ_KEY, true);
    }

    DataFileWriter<GenericRecord> writer = new DataFileWriter<GenericRecord>(
            new GenericDatumWriter<GenericRecord>(schema));
    if (getCompressOutput(job)) {
        int level = job.getInt(AvroOutputFormat.DEFLATE_LEVEL_KEY, AvroOutputFormat.DEFAULT_DEFLATE_LEVEL);
        String codecName = job.get(AvroJob.CONF_OUTPUT_CODEC,
                org.apache.avro.file.DataFileConstants.DEFLATE_CODEC);
        CodecFactory codec = codecName.equals(DataFileConstants.DEFLATE_CODEC)
                ? CodecFactory.deflateCodec(level)
                : CodecFactory.fromString(codecName);
        writer.setCodec(codec);
    }
    writer.setSyncInterval(
            job.getInt(AvroOutputFormat.SYNC_INTERVAL_KEY, DataFileConstants.DEFAULT_SYNC_INTERVAL));

    Path path = FileOutputFormat.getTaskOutputPath(job, name + AvroOutputFormat.EXT);
    writer.create(schema, path.getFileSystem(job).create(path));

    return new AvroAsJSONRecordWriter(writer, converter, readKey);
}
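
Here job is a JobConf, which extends Configuration, so it can be passed directly to path.getFileSystem(job).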

From source file:com.cloudera.science.quince.FileUtils.java

License:Open Source License

public static Path[] findVcfs(Path path, Configuration conf) throws IOException {
    FileSystem fs = path.getFileSystem(conf);
    if (fs.isDirectory(path)) {
        FileStatus[] fileStatuses = fs.listStatus(path, new HiddenPathFilter());
        Path[] vcfs = new Path[fileStatuses.length];
        int i = 0;
        for (FileStatus status : fileStatuses) {
            vcfs[i++] = status.getPath();
        }
        return vcfs;
    } else {
        return new Path[] { path };
    }
}

From source file:com.cloudera.science.quince.FileUtils.java

License:Open Source License

public static boolean sampleGroupExists(Path path, Configuration conf, String sampleGroup) throws IOException {
    FileSystem fs = path.getFileSystem(conf);
    if (!fs.exists(path)) {
        return false;
    }
    for (FileStatus chrStatus : fs.listStatus(path, new PartitionPathFilter("chr"))) {
        for (FileStatus posStatus : fs.listStatus(chrStatus.getPath(), new PartitionPathFilter("pos"))) {
            if (fs.listStatus(posStatus.getPath(),
                    new PartitionPathFilter("sample_group", sampleGroup)).length > 0) {
                return true;
            }
        }
    }
    return false;
}