Example usage for org.apache.hadoop.fs.Path.getFileSystem


Introduction

This page collects usage examples of org.apache.hadoop.fs.Path.getFileSystem drawn from open-source projects.

Prototype

public FileSystem getFileSystem(Configuration conf) throws IOException 


Document

Return the FileSystem that owns this Path.
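
Before the project-specific examples below, here is a minimal, self-contained sketch of the typical pattern: resolve a Path against a Configuration to obtain the FileSystem that owns it, then perform I/O through that FileSystem. The path used here is purely illustrative.

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class GetFileSystemExample {
    public static void main(String[] args) throws IOException {
        Configuration conf = new Configuration();

        // Illustrative path; the scheme (file://, hdfs://, s3a://, ...) determines
        // which FileSystem implementation getFileSystem() resolves to.
        Path path = new Path("/tmp/example.txt");

        // Resolve the FileSystem that owns this Path; with no scheme, this falls
        // back to the default file system configured in conf (fs.defaultFS).
        FileSystem fs = path.getFileSystem(conf);

        if (fs.exists(path)) {
            try (FSDataInputStream in = fs.open(path)) {
                System.out.println("First byte: " + in.read());
            }
        }
    }
}

The same FileSystem handle obtained this way is what the examples below use for listing directories, opening input splits, and moving output files.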

Usage

From source file:com.fullcontact.sstable.hadoop.mapreduce.SSTableInputFormat.java

License:Apache License

/**
 * If we have a directory recursively gather the files we care about for this job.
 *
 * @param file Root file/directory.
 * @param job Job context.
 * @return All files we care about.
 * @throws IOException
 */
private Collection<FileStatus> handleFile(final FileStatus file, final JobContext job) throws IOException {
    final List<FileStatus> results = Lists.newArrayList();

    if (file.isDir()) {
        final Path p = file.getPath();
        LOG.debug("Expanding {}", p);
        final FileSystem fs = p.getFileSystem(job.getConfiguration());
        final FileStatus[] children = fs.listStatus(p);
        for (FileStatus child : children) {
            results.addAll(handleFile(child, job));
        }
    } else {
        results.add(file);
    }

    return results;
}

From source file:com.fullcontact.sstable.hadoop.mapreduce.SSTableInputFormat.java

License:Apache License

@Override
public List<InputSplit> getSplits(final JobContext job) throws IOException {
    final Configuration configuration = job.getConfiguration();

    final List<InputSplit> result = Lists.newArrayList();

    final List<FileStatus> files = listStatus(job);

    LOG.debug("Initial file list: {} {}", files.size(), files);

    for (final FileStatus fileStatus : files) {
        final Path dataFile = fileStatus.getPath();
        final FileSystem fileSystem = dataFile.getFileSystem(configuration);
        final BlockLocation[] blockLocations = fileSystem.getFileBlockLocations(fileStatus, 0,
                fileStatus.getLen());

        // Data file, try to split if the .index file was found
        final SSTableIndexIndex index = indexes.get(dataFile);
        if (index == null) {
            throw new IOException("Index not found for " + dataFile);
        }

        for (final SSTableIndexIndex.Chunk chunk : index.getOffsets()) {
            // This isn't likely to work well because we are dealing with the index into uncompressed data...
            final int blockIndex = getBlockIndex(blockLocations,
                    chunk.getStart() / COMPRESSION_RATIO_ASSUMPTION);
            final SSTableSplit split = new SSTableSplit(dataFile, chunk.getStart(), chunk.getEnd(),
                    chunk.getEnd() - chunk.getStart(), blockLocations[blockIndex].getHosts());
            result.add(split);
        }
    }

    LOG.debug("Splits calculated: {} {}", result.size(), result);

    return result;
}

From source file:com.gemstone.gemfire.cache.hdfs.internal.hoplog.mapreduce.GFInputFormat.java

License:Apache License

/**
 * Identifies filters provided in the job configuration and creates a list of
 * sorted hoplogs. If there are no sorted hoplogs, checks if the region has
 * sequential hoplogs.
 * 
 * @return list of hoplogs
 * @throws IOException
 */
protected Collection<FileStatus> getHoplogs() throws IOException {
    String regionName = conf.get(INPUT_REGION);
    System.out.println("GFInputFormat: Region Name is " + regionName);
    if (regionName == null || regionName.trim().isEmpty()) {
        // incomplete job configuration, region name must be provided
        return new ArrayList<FileStatus>();
    }

    String home = conf.get(HOME_DIR, HDFSStore.DEFAULT_HOME_DIR);
    regionName = HdfsRegionManager.getRegionFolder(regionName);
    Path regionPath = new Path(home + "/" + regionName);
    FileSystem fs = regionPath.getFileSystem(conf);

    long start = conf.getLong(START_TIME, 0L);
    long end = conf.getLong(END_TIME, 0L);
    boolean checkpoint = conf.getBoolean(CHECKPOINT, true);

    // if the region contains flush hoplogs then the region is of type RW.
    Collection<FileStatus> hoplogs;
    hoplogs = HoplogUtil.filterHoplogs(fs, regionPath, start, end, checkpoint);
    return hoplogs == null ? new ArrayList<FileStatus>() : hoplogs;
}

From source file:com.geneix.bottle.WordRecordReader.java

License:Apache License

public void initialize(InputSplit genericSplit, TaskAttemptContext context) throws IOException {
    if (LOG.isInfoEnabled()) {
        LOG.info("Initializing WordRecordReader");
    }
    FileSplit split = (FileSplit) genericSplit;
    Configuration job = context.getConfiguration();
    this.maxWordLength = job.getInt(MAX_WORD_LENGTH, Integer.MAX_VALUE);
    start = split.getStart();
    end = start + split.getLength();
    final Path file = split.getPath();

    // open the file and seek to the start of the split
    final FileSystem fs = file.getFileSystem(job);
    fileIn = fs.open(file);

    CompressionCodec codec = new CompressionCodecFactory(job).getCodec(file);
    if (null != codec) {
        throw new IOException("Cannot handle compressed files right now");
    } else {
        fileIn.seek(start);
        in = new WordReader(fileIn, job);
        filePosition = fileIn;
    }
    // If this is not the first split, we always throw away the first record
    // because we always (except for the last split) read one extra line in
    // the next() method.
    if (start != 0) {
        start += in.readWord(new Text(), 0, maxBytesToConsume(start));
    }
    this.pos = start;
}

From source file:com.github.bskaggs.avro_json_hadoop.AvroAsJsonRecordReader.java

License:Apache License

/** {@inheritDoc} */
@Override
public void initialize(InputSplit inputSplit, TaskAttemptContext context)
        throws IOException, InterruptedException {
    if (!(inputSplit instanceof FileSplit)) {
        throw new IllegalArgumentException("Only compatible with FileSplits.");
    }
    FileSplit fileSplit = (FileSplit) inputSplit;

    // Open a seekable input stream to the Avro container file.
    SeekableInput seekableFileInput = createSeekableInput(context.getConfiguration(), fileSplit.getPath());

    // Wrap the seekable input stream in an Avro DataFileReader.
    Configuration conf = context.getConfiguration();
    GenericData dataModel = AvroSerialization.createDataModel(conf);

    GenericDatumReader<Object> reader = new GenericDatumReader<Object>();

    //Figure out the schema
    Path path = fileSplit.getPath();
    FSDataInputStream schemaStream = path.getFileSystem(conf).open(path);
    DataFileStream<Object> streamReader = new DataFileStream<Object>(schemaStream, reader);
    Schema mReaderSchema = streamReader.getSchema();
    streamReader.close();

    //Set up writer and encoder for json
    writer = new GenericDatumWriter<Object>(mReaderSchema);
    encoder = new TerseJsonEncoder(mReaderSchema, bout);

    @SuppressWarnings("unchecked")
    DatumReader<Object> datumReader = dataModel.createDatumReader(mReaderSchema);
    mAvroFileReader = createAvroFileReader(seekableFileInput, datumReader);

    // Initialize the start and end offsets into the file based on the boundaries of the
    // input split we're responsible for.  We will read the first block that begins
    // after the input split start boundary.  We will read up to but not including the
    // first block that starts after input split end boundary.

    // Sync to the closest block/record boundary just after beginning of our input split.
    mAvroFileReader.sync(fileSplit.getStart());

    // Initialize the start position to the beginning of the first block of the input split.
    mStartPosition = mAvroFileReader.previousSync();

    // Initialize the end position to the end of the input split (this isn't necessarily
    // on a block boundary, so using this for reporting progress will be approximate).
    mEndPosition = fileSplit.getStart() + fileSplit.getLength();
}

From source file:com.github.bskaggs.mapreduce.flowfile.AbstractFlowFileV3RecordReader.java

License:Apache License

@Override
public void initialize(InputSplit split, TaskAttemptContext context) throws IOException, InterruptedException {
    FileSplit fileSplit = (FileSplit) split;

    Path file = fileSplit.getPath();
    FileSystem fs = file.getFileSystem(context.getConfiguration());
    fileStream = fs.open(file);

    startPos = fileSplit.getStart();
    nextPos = startPos;
    length = fileSplit.getLength();
    lastPos = nextPos + length;
}

From source file:com.github.dryangkun.hbase.tidx.hive.HiveHFileOutputFormat.java

License:Apache License

@Override
public RecordWriter getHiveRecordWriter(final JobConf jc, final Path finalOutPath,
        Class<? extends Writable> valueClass, boolean isCompressed, Properties tableProperties,
        final Progressable progressable) throws IOException {

    // Read configuration for the target path, first from jobconf, then from table properties
    String hfilePath = getFamilyPath(jc, tableProperties);
    if (hfilePath == null) {
        throw new RuntimeException("Please set " + HFILE_FAMILY_PATH + " to target location for HFiles");
    }

    // Target path's last component is also the column family name.
    final Path columnFamilyPath = new Path(hfilePath);
    final String columnFamilyName = columnFamilyPath.getName();
    final byte[] columnFamilyNameBytes = Bytes.toBytes(columnFamilyName);
    final Job job = new Job(jc);
    setCompressOutput(job, isCompressed);
    setOutputPath(job, finalOutPath);

    // Create the HFile writer
    final org.apache.hadoop.mapreduce.TaskAttemptContext tac = ShimLoader.getHadoopShims()
            .newTaskAttemptContext(job.getConfiguration(), progressable);

    final Path outputdir = FileOutputFormat.getOutputPath(tac);
    final org.apache.hadoop.mapreduce.RecordWriter<ImmutableBytesWritable, KeyValue> fileWriter = getFileWriter(
            tac);

    // Individual columns are going to be pivoted to HBase cells,
    // and for each row, they need to be written out in order
    // of column name, so sort the column names now, creating a
    // mapping to their column position.  However, the first
    // column is interpreted as the row key.
    String columnList = tableProperties.getProperty("columns");
    String[] columnArray = columnList.split(",");
    final SortedMap<byte[], Integer> columnMap = new TreeMap<byte[], Integer>(Bytes.BYTES_COMPARATOR);
    int i = 0;
    for (String columnName : columnArray) {
        if (i != 0) {
            columnMap.put(Bytes.toBytes(columnName), i);
        }
        ++i;
    }

    return new RecordWriter() {

        @Override
        public void close(boolean abort) throws IOException {
            try {
                fileWriter.close(null);
                if (abort) {
                    return;
                }
                // Move the HFile(s) from the task output directory to the
                // location specified by the user.
                FileSystem fs = outputdir.getFileSystem(jc);
                fs.mkdirs(columnFamilyPath);
                Path srcDir = outputdir;
                for (;;) {
                    FileStatus[] files = fs.listStatus(srcDir, FileUtils.STAGING_DIR_PATH_FILTER);
                    if ((files == null) || (files.length == 0)) {
                        throw new IOException("No family directories found in " + srcDir);
                    }
                    if (files.length != 1) {
                        throw new IOException("Multiple family directories found in " + srcDir);
                    }
                    srcDir = files[0].getPath();
                    if (srcDir.getName().equals(columnFamilyName)) {
                        break;
                    }
                }
                for (FileStatus regionFile : fs.listStatus(srcDir, FileUtils.STAGING_DIR_PATH_FILTER)) {
                    fs.rename(regionFile.getPath(), new Path(columnFamilyPath, regionFile.getPath().getName()));
                }
                // Hive actually wants a file as task output (not a directory), so
                // replace the empty directory with an empty file to keep it happy.
                fs.delete(outputdir, true);
                fs.createNewFile(outputdir);
            } catch (InterruptedException ex) {
                throw new IOException(ex);
            }
        }

        private void writeText(Text text) throws IOException {
            // Decompose the incoming text row into fields.
            String s = text.toString();
            String[] fields = s.split("\u0001");
            assert (fields.length <= (columnMap.size() + 1));
            // First field is the row key.
            byte[] rowKeyBytes = Bytes.toBytes(fields[0]);
            // Remaining fields are cells addressed by column name within row.
            for (Map.Entry<byte[], Integer> entry : columnMap.entrySet()) {
                byte[] columnNameBytes = entry.getKey();
                int iColumn = entry.getValue();
                String val;
                if (iColumn >= fields.length) {
                    // trailing blank field
                    val = "";
                } else {
                    val = fields[iColumn];
                    if ("\\N".equals(val)) {
                        // omit nulls
                        continue;
                    }
                }
                byte[] valBytes = Bytes.toBytes(val);
                KeyValue kv = new KeyValue(rowKeyBytes, columnFamilyNameBytes, columnNameBytes, valBytes);
                try {
                    fileWriter.write(null, kv);
                } catch (IOException e) {
                    LOG.error("Failed while writing row: " + s);
                    throw e;
                } catch (InterruptedException ex) {
                    throw new IOException(ex);
                }
            }
        }

        private void writePut(PutWritable put) throws IOException {
            ImmutableBytesWritable row = new ImmutableBytesWritable(put.getPut().getRow());
            SortedMap<byte[], List<Cell>> cells = put.getPut().getFamilyCellMap();
            for (Map.Entry<byte[], List<Cell>> entry : cells.entrySet()) {
                Collections.sort(entry.getValue(), new CellComparator());
                for (Cell c : entry.getValue()) {
                    try {
                        fileWriter.write(row, KeyValueUtil.copyToNewKeyValue(c));
                    } catch (InterruptedException e) {
                        throw (InterruptedIOException) new InterruptedIOException().initCause(e);
                    }
                }
            }
        }

        @Override
        public void write(Writable w) throws IOException {
            if (w instanceof Text) {
                writeText((Text) w);
            } else if (w instanceof PutWritable) {
                writePut((PutWritable) w);
            } else {
                throw new IOException("Unexpected writable " + w);
            }
        }
    };
}

From source file:com.github.gaoyangthu.demo.mapred.terasort.TeraSort.java

License:Apache License

public int run(String[] args) throws Exception {
    LOG.info("starting");
    JobConf job = (JobConf) getConf();
    Path inputDir = new Path(args[0]);
    inputDir = inputDir.makeQualified(inputDir.getFileSystem(job));
    Path partitionFile = new Path(inputDir, TeraInputFormat.PARTITION_FILENAME);
    URI partitionUri = new URI(partitionFile.toString() + "#" + TeraInputFormat.PARTITION_FILENAME);
    TeraInputFormat.setInputPaths(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));
    job.setJobName("TeraSort");
    job.setJarByClass(TeraSort.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);
    job.setInputFormat(TeraInputFormat.class);
    job.setOutputFormat(TeraOutputFormat.class);
    job.setPartitionerClass(TotalOrderPartitioner.class);
    TeraInputFormat.writePartitionFile(job, partitionFile);
    DistributedCache.addCacheFile(partitionUri, job);
    DistributedCache.createSymlink(job);
    job.setInt("dfs.replication", 1);
    TeraOutputFormat.setFinalSync(job, true);
    JobClient.runJob(job);
    LOG.info("done");
    return 0;
}

From source file:com.github.joshelser.accumulo.DelimitedIngest.java

License:Apache License

private List<Path> convertInputToPaths() throws IOException {
    List<String> inputs = args.getInput();
    List<Path> paths = new ArrayList<>(inputs.size());
    for (String input : inputs) {
        Path p = new Path(input);
        FileSystem fs = p.getFileSystem(conf);
        FileStatus fstat = fs.getFileStatus(p);
        if (fstat.isFile()) {
            paths.add(p);
        } else if (fstat.isDirectory()) {
            for (FileStatus child : fs.listStatus(p)) {
                if (child.isFile()) {
                    paths.add(child.getPath());
                }
            }
        } else {
            throw new IllegalStateException("Unable to handle that which is not file nor directory: " + p);
        }
    }
    return paths;
}

From source file:com.github.joshelser.accumulo.DelimitedIngest.java

License:Apache License

private void processSinglePathWithByteBuffer(BatchWriter writer, FileMapping mapping, Path p, CsvParser parser)
        throws IOException, MutationsRejectedException {
    final FileSystem fs = p.getFileSystem(conf);
    FSDataInputStream dis = fs.open(p, INPUT_BUFFER_SIZE);
    InputStreamReader reader = new InputStreamReader(dis, UTF_8);
    try {
        parser.beginParsing(reader);
        String[] line = null;
        while ((line = parser.parseNext()) != null) {
            writer.addMutation(parseLine(mapping, line));
        }
    } finally {
        if (null != reader) {
            reader.close();
        }
    }
}