List of usage examples for org.apache.hadoop.fs.Path#getFileSystem
public FileSystem getFileSystem(Configuration conf) throws IOException
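Before the project examples below, here is a minimal sketch of the basic call pattern: resolve the FileSystem that owns a Path from a Configuration, then perform I/O through it. The path "/tmp/example.txt" and the class name GetFileSystemExample are placeholders chosen for illustration, not taken from any of the source files listed here.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IOUtils;

public class GetFileSystemExample {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();   // picks up core-site.xml etc. from the classpath
        Path path = new Path("/tmp/example.txt");   // placeholder path for illustration
        FileSystem fs = path.getFileSystem(conf);   // resolves the FileSystem for this path's scheme
        FSDataInputStream in = null;
        try {
            in = fs.open(path);                     // read through the resolved file system
            IOUtils.copyBytes(in, System.out, 4096, false);
        } finally {
            IOUtils.closeStream(in);
        }
    }
}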
From source file:com.alectenharmsel.research.WholeBlockRecordReader.java
License:Apache License
public boolean nextKeyValue() throws IOException, InterruptedException {
    if (!processed) {
        System.err.println("start is " + start);
        Path file = fileSplit.getPath();
        String tmp = file.toString();
        System.err.println("File: " + tmp);
        currKey.set(tmp);
        System.err.println("Reached this point");
        FileSystem fs = file.getFileSystem(conf);
        System.err.println("fs blocksize: " + fs.getDefaultBlockSize(file));
        System.err.println("linecount blocksize: " + blockSize);
        byte[] contents;
        FSDataInputStream in = null;
        try {
            in = fs.open(file);
            System.err.println("getPos(): " + in.getPos());
            if ((start + blockSize) > fileLength) {
                blockSize = (int) (fileLength - start);
                processed = true;
            }
            contents = new byte[blockSize];
            //IOUtils.readFully(in, contents, start, blockSize);
            //IOUtils.readFully(in, contents, 0, blockSize);
            in.readFully(start, contents);
            start += blockSize;
            currValue.set(contents);
        } finally {
            IOUtils.closeStream(in);
        }
        return true;
    }
    return false;
}
From source file:com.alexholmes.hadooputils.combine.avro.mapred.CombineAvroInputFormatTest.java
License:Apache License
@SuppressWarnings("deprecation") public void testJob() throws Exception { JobConf job = new JobConf(); String dir = System.getProperty("test.dir", ".") + "/mapred"; Path outputPath = new Path(dir + "/out"); outputPath.getFileSystem(job).delete(outputPath); WordCountUtil.writeLinesFile();/* w w w. ja v a2 s. com*/ job.setJobName("wordcount"); AvroJob.setInputSchema(job, Schema.create(Schema.Type.STRING)); AvroJob.setOutputSchema(job, new Pair<Utf8, Long>(new Utf8(""), 0L).getSchema()); AvroJob.setMapperClass(job, MapImpl.class); AvroJob.setCombinerClass(job, ReduceImpl.class); AvroJob.setReducerClass(job, ReduceImpl.class); job.setInputFormat(CombineAvroInputFormat.class); FileInputFormat.setInputPaths(job, new Path(dir + "/in")); FileOutputFormat.setOutputPath(job, outputPath); FileOutputFormat.setCompressOutput(job, true); WordCountUtil.setMeta(job); JobClient.runJob(job); WordCountUtil.validateCountsFile(); }
From source file:com.alexholmes.hadooputils.combine.seqfile.mapreduce.CombineSequenceFileInputFormat.java
License:Apache License
@Override
@SuppressWarnings("unchecked")
protected List<FileStatus> listStatus(JobContext job) throws IOException {
    List<FileStatus> files = super.listStatus(job);

    int len = files.size();
    for (int i = 0; i < len; ++i) {
        FileStatus file = files.get(i);
        if (file.isDir()) { // it's a MapFile
            Path p = file.getPath();
            FileSystem fs = p.getFileSystem(job.getConfiguration());
            // use the data file
            files.set(i, fs.getFileStatus(new Path(p, MapFile.DATA_FILE_NAME)));
        }
    }
    return files;
}
From source file:com.alexholmes.hadooputils.sort.DelimitedLineRecordReader.java
License:Apache License
protected void initialize(Configuration job, FileSplit split) throws IOException {
    this.maxLineLength = job.getInt("mapred.linerecordreader.maxlength", Integer.MAX_VALUE);
    start = split.getStart();
    end = start + split.getLength();
    final Path file = split.getPath();
    compressionCodecs = new CompressionCodecFactory(job);
    final CompressionCodec codec = compressionCodecs.getCodec(file);

    // open the file and seek to the start of the split
    FileSystem fs = file.getFileSystem(job);
    fileIn = fs.open(split.getPath());
    boolean skipFirstLine = false;

    String rowDelim = job.get("textinputformat.record.delimiter", null);

    if (codec != null) {
        if (rowDelim != null) {
            byte[] hexcode = SortConfig.getHexDelimiter(rowDelim);
            in = new DelimitedLineReader(codec.createInputStream(fileIn), job,
                    (hexcode != null) ? hexcode : rowDelim.getBytes());
        } else {
            in = new DelimitedLineReader(codec.createInputStream(fileIn), job);
        }
        end = Long.MAX_VALUE;
    } else {
        if (start != 0) {
            skipFirstLine = true;
            --start;
            fileIn.seek(start);
        }
        if (rowDelim != null) {
            byte[] hexcode = SortConfig.getHexDelimiter(rowDelim);
            in = new DelimitedLineReader(fileIn, job, (hexcode != null) ? hexcode : rowDelim.getBytes());
        } else {
            in = new DelimitedLineReader(fileIn, job);
        }
    }
    if (skipFirstLine) { // skip first line and re-establish "start".
        start += in.readLine(new Text(), 0, (int) Math.min((long) Integer.MAX_VALUE, end - start));
    }
    this.pos = start;
}
From source file:com.alexholmes.hadooputils.sort.DelimitedTextOutputFormat.java
License:Apache License
public RecordWriter<K, V> getRecordWriter(FileSystem ignored, JobConf job, String name, Progressable progress)
        throws IOException {
    SortConfig sortConf = new SortConfig(job);
    boolean isCompressed = getCompressOutput(job);
    String lineSeparator = sortConf.getRowSeparator("\n");
    byte[] hexcode = SortConfig.getHexDelimiter(lineSeparator);
    lineSeparator = (hexcode != null) ? new String(hexcode, "UTF-8") : lineSeparator;

    if (!isCompressed) {
        Path file = FileOutputFormat.getTaskOutputPath(job, name);
        FileSystem fs = file.getFileSystem(job);
        FSDataOutputStream fileOut = fs.create(file, progress);
        return new DelimitedLineRecordWriter<K, V>(fileOut, lineSeparator);
    } else {
        Class<? extends CompressionCodec> codecClass = getOutputCompressorClass(job, GzipCodec.class);
        CompressionCodec codec = ReflectionUtils.newInstance(codecClass, job);
        Path file = FileOutputFormat.getTaskOutputPath(job, name + codec.getDefaultExtension());
        FileSystem fs = file.getFileSystem(job);
        FSDataOutputStream fileOut = fs.create(file, progress);
        return new DelimitedLineRecordWriter<K, V>(new DataOutputStream(codec.createOutputStream(fileOut)),
                lineSeparator);
    }
}
From source file:com.alexholmes.hadooputils.sort.LzoDelimitedLineRecordReader.java
License:Apache License
@Override
protected void initialize(Configuration job, FileSplit split) throws IOException {
    start = split.getStart();
    end = start + split.getLength();
    final Path file = split.getPath();

    FileSystem fs = file.getFileSystem(job);
    CompressionCodecFactory compressionCodecs = new CompressionCodecFactory(job);
    final CompressionCodec codec = compressionCodecs.getCodec(file);
    if (codec == null) {
        throw new IOException("Codec for file " + file + " not found, cannot run");
    }

    // open the file and seek to the start of the split
    fileIn = fs.open(split.getPath());

    // creates input stream and also reads the file header
    String rowDelim = job.get("textinputformat.record.delimiter", null);
    if (rowDelim != null) {
        byte[] hexcode = SortConfig.getHexDelimiter(rowDelim);
        in = new DelimitedLineReader(fileIn, job, (hexcode != null) ? hexcode : rowDelim.getBytes());
    } else {
        in = new DelimitedLineReader(codec.createInputStream(fileIn), job);
    }

    if (start != 0) {
        fileIn.seek(start);

        // read and ignore the first line
        in.readLine(new Text());
        start = fileIn.getPos();
    }
    this.pos = start;
}
From source file:com.alexholmes.hadooputils.sort.Sort.java
License:Apache License
/**
 * The driver for the sort MapReduce job.
 *
 * @param jobConf           sort configuration
 * @param numMapTasks       number of map tasks
 * @param numReduceTasks    number of reduce tasks
 * @param sampler           sampler, if required
 * @param codecClass        the compression codec for compressing final outputs
 * @param mapCodecClass     the compression codec for compressing intermediary map outputs
 * @param createLzopIndexes whether or not a MR job should be launched to create LZOP indexes
 *                          for the job output files
 * @param inputDirAsString  input directory in CSV-form
 * @param outputDirAsString output directory
 * @return true if the job completed successfully
 * @throws IOException        if something went wrong
 * @throws URISyntaxException if a URI wasn't correctly formed
 */
public boolean runJob(final JobConf jobConf, final Integer numMapTasks, final Integer numReduceTasks,
        final InputSampler.Sampler<K, V> sampler, final Class<? extends CompressionCodec> codecClass,
        final Class<? extends CompressionCodec> mapCodecClass, final boolean createLzopIndexes,
        final String inputDirAsString, final String outputDirAsString) throws IOException, URISyntaxException {

    jobConf.setJarByClass(Sort.class);
    jobConf.setJobName("sorter");

    JobClient client = new JobClient(jobConf);
    ClusterStatus cluster = client.getClusterStatus();

    if (numMapTasks != null) {
        jobConf.setNumMapTasks(numMapTasks);
    }
    if (numReduceTasks != null) {
        jobConf.setNumReduceTasks(numReduceTasks);
    } else {
        int numReduces = (int) (cluster.getMaxReduceTasks() * 0.9);
        String sortReduces = jobConf.get("test.sort.reduces_per_host");
        if (sortReduces != null) {
            numReduces = cluster.getTaskTrackers() * Integer.parseInt(sortReduces);
        }
        // Set user-supplied (possibly default) job configs
        jobConf.setNumReduceTasks(numReduces);
    }

    jobConf.setMapperClass(IdentityMapper.class);
    jobConf.setReducerClass(SortReduce.class);

    jobConf.setInputFormat(SortInputFormat.class);

    jobConf.setMapOutputKeyClass(Text.class);
    jobConf.setMapOutputValueClass(Text.class);
    jobConf.setOutputKeyClass(Text.class);
    jobConf.setOutputValueClass(Text.class);

    if (mapCodecClass != null) {
        jobConf.setMapOutputCompressorClass(mapCodecClass);
    }

    if (codecClass != null) {
        jobConf.setBoolean("mapred.output.compress", true);
        jobConf.setClass("mapred.output.compression.codec", codecClass, CompressionCodec.class);
    }

    FileInputFormat.setInputPaths(jobConf, inputDirAsString);
    FileOutputFormat.setOutputPath(jobConf, new Path(outputDirAsString));

    if (sampler != null) {
        System.out.println("Sampling input to effect total-order sort...");
        jobConf.setPartitionerClass(TotalOrderPartitioner.class);
        Path inputDir = FileInputFormat.getInputPaths(jobConf)[0];

        FileSystem fileSystem = FileSystem.get(jobConf);

        if (fileSystem.exists(inputDir) && fileSystem.isFile(inputDir)) {
            inputDir = inputDir.getParent();
        }
        inputDir = inputDir.makeQualified(inputDir.getFileSystem(jobConf));
        Path partitionFile = new Path(inputDir, "_sortPartitioning");
        TotalOrderPartitioner.setPartitionFile(jobConf, partitionFile);
        InputSampler.writePartitionFile(jobConf, sampler);
        URI partitionUri = new URI(partitionFile.toString() + "#" + "_sortPartitioning");
        DistributedCache.addCacheFile(partitionUri, jobConf);
        DistributedCache.createSymlink(jobConf);
    }

    System.out.println("Running on " + cluster.getTaskTrackers() + " nodes to sort from "
            + FileInputFormat.getInputPaths(jobConf)[0] + " into " + FileOutputFormat.getOutputPath(jobConf)
            + " with " + jobConf.getNumReduceTasks() + " reduces.");
    Date startTime = new Date();
    System.out.println("Job started: " + startTime);
    jobResult = JobClient.runJob(jobConf);
    Date endTime = new Date();
    System.out.println("Job ended: " + endTime);
    System.out.println("The job took "
            + TimeUnit.MILLISECONDS.toSeconds(endTime.getTime() - startTime.getTime()) + " seconds.");

    if (jobResult.isSuccessful()) {
        if (createLzopIndexes && codecClass != null && LzopCodec.class.equals(codecClass)) {
            new LzoIndexer(jobConf).index(new Path(outputDirAsString));
        }
        return true;
    }
    return false;
}
From source file:com.alexholmes.hadooputils.sort.SortInputSampler.java
License:Apache License
public static <K, V> void writePartitionFile(JobConf job, Sampler<K, V> sampler) throws IOException {
    Configuration conf = job;
    // Use the input format defined in the job, NOT the one provided by
    // the parent class's writePartitionFile() method, which will be a plain
    // TextInputFormat by default
    final InputFormat inf = job.getInputFormat();
    int numPartitions = job.getNumReduceTasks();
    K[] samples = (K[]) sampler.getSample(inf, job);
    RawComparator<K> comparator = (RawComparator<K>) job.getOutputKeyComparator();
    Arrays.sort(samples, comparator);
    Path dst = new Path(TotalOrderPartitioner.getPartitionFile(job));
    FileSystem fs = dst.getFileSystem(conf);
    if (fs.exists(dst)) {
        fs.delete(dst, false);
    }
    SequenceFile.Writer writer = SequenceFile.createWriter(fs, conf, dst, job.getMapOutputKeyClass(),
            NullWritable.class);
    NullWritable nullValue = NullWritable.get();
    float stepSize = samples.length / (float) numPartitions;
    int last = -1;
    for (int i = 1; i < numPartitions; ++i) {
        int k = Math.round(stepSize * i);
        while (last >= k && comparator.compare(samples[last], samples[k]) == 0) {
            ++k;
        }
        writer.append(samples[k], nullValue);
        last = k;
    }
    writer.close();
}
From source file:com.alexholmes.hdfsslurper.Configurator.java
License:Apache License
public static void validateSameFileSystem(Path p1, Path p2, Configuration config)
        throws IOException, ConfigSettingException {
    FileSystem fs1 = p1.getFileSystem(config);
    FileSystem fs2 = p2.getFileSystem(config);
    if (!compareFs(fs1, fs2)) {
        throw new ConfigSettingException("The two paths must exist on the same file system: " + p1 + "," + p2);
    }
    if (p1.equals(p2)) {
        throw new ConfigSettingException("The paths must be distinct: " + p1);
    }
}
From source file:com.alexholmes.hdfsslurper.Configurator.java
License:Apache License
public static void testCreateDir(Path p, Configuration conf)
        throws IOException, ConfigSettingException, FileSystemMkdirFailed {
    FileSystem fs = p.getFileSystem(conf);
    if (fs.exists(p) && !fs.getFileStatus(p).isDir()) {
        throw new ConfigSettingException("Directory appears to be a file: '" + p + "'");
    }
    if (!fs.exists(p)) {
        log.info("Attempting creation of directory: " + p);
        if (!fs.mkdirs(p)) {
            throw new FileSystemMkdirFailed("Failed to create directory: '" + p + "'");
        }
    }
}