List of usage examples for org.apache.hadoop.fs Path getFileSystem
public FileSystem getFileSystem(Configuration conf) throws IOException
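Before the per-project examples below, a minimal self-contained sketch of the call itself may help. The class name, path, and configuration values here are hypothetical and only illustrate how the resolved FileSystem is typically used; the examples that follow show the same call embedded in real input formats, record readers, partitioners, and writers.

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class GetFileSystemSketch {
    public static void main(String[] args) throws IOException {
        // Picks up core-site.xml / hdfs-site.xml from the classpath, if present.
        Configuration conf = new Configuration();
        // Hypothetical path; its URI scheme (hdfs://, file://, s3a://, ...) decides which FileSystem is returned.
        Path path = new Path("/tmp/example.txt");
        // getFileSystem(conf) resolves the FileSystem that owns this path's scheme and authority.
        FileSystem fs = path.getFileSystem(conf);
        // Use the resolved FileSystem to inspect the path; throws FileNotFoundException if it is missing.
        FileStatus status = fs.getFileStatus(path);
        System.out.println(status.getLen() + " bytes at " + status.getPath());
    }
}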
From source file:com.dinglicom.clouder.mapreduce.input.FileInputFormat.java
License:Apache License
/**
 * Add a {@link Path} to the list of inputs for the map-reduce job.
 *
 * @param job  The {@link Job} to modify
 * @param path {@link Path} to be added to the list of inputs for
 *             the map-reduce job.
 */
public static void addInputPath(Job job, Path path) throws IOException {
    Configuration conf = job.getConfiguration();
    path = path.getFileSystem(conf).makeQualified(path);
    String dirStr = StringUtils.escapeString(path.toString());
    String dirs = conf.get(INPUT_DIR);
    conf.set(INPUT_DIR, dirs == null ? dirStr : dirs + "," + dirStr);
}
From source file:com.dinglicom.clouder.mapreduce.input.LineRecordReader.java
License:Apache License
public void initialize(InputSplit genericSplit, TaskAttemptContext context) throws IOException {
    FileSplit split = (FileSplit) genericSplit;
    System.out.println("-------------------length:" + split.getLength() + "\tposition:" + split.getStart());
    Configuration job = context.getConfiguration();
    this.maxLineLength = job.getInt(MAX_LINE_LENGTH, Integer.MAX_VALUE);
    start = split.getStart();
    end = start + split.getLength();
    final Path file = split.getPath();
    key = new Text(FileToCDRType.getTypeByPath(file.getName()));
    compressionCodecs = new CompressionCodecFactory(job);
    codec = compressionCodecs.getCodec(file);

    // open the file and seek to the start of the split
    final FileSystem fs = file.getFileSystem(job);
    fileIn = fs.open(file);
    if (isCompressedInput()) {
        decompressor = CodecPool.getDecompressor(codec);
        if (codec instanceof SplittableCompressionCodec) {
            final SplitCompressionInputStream cIn = ((SplittableCompressionCodec) codec).createInputStream(
                    fileIn, decompressor, start, end, SplittableCompressionCodec.READ_MODE.BYBLOCK);
            if (null == this.recordDelimiterBytes) {
                in = new LineReader(cIn, job);
            } else {
                in = new LineReader(cIn, job, this.recordDelimiterBytes);
            }
            start = cIn.getAdjustedStart();
            end = cIn.getAdjustedEnd();
            filePosition = cIn;
        } else {
            if (null == this.recordDelimiterBytes) {
                in = new LineReader(codec.createInputStream(fileIn, decompressor), job);
            } else {
                in = new LineReader(codec.createInputStream(fileIn, decompressor), job,
                        this.recordDelimiterBytes);
            }
            filePosition = fileIn;
        }
    } else {
        fileIn.seek(start);
        if (null == this.recordDelimiterBytes) {
            in = new LineReader(fileIn, job);
        } else {
            in = new LineReader(fileIn, job, this.recordDelimiterBytes);
        }
        filePosition = fileIn;
    }
    // If this is not the first split, we always throw away first record
    // because we always (except the last split) read one extra line in
    // next() method.
    if (start != 0) {
        start += in.readLine(new Text(), 0, maxBytesToConsume(start));
    }
    this.pos = start;
}
From source file:com.ds.lzo.DeprecatedLzoLineRecordReaderForCombined.java
License:Open Source License
public DeprecatedLzoLineRecordReaderForCombined(Configuration conf, FileSplit split) throws IOException {
    LOG.warn("split start: " + split.getStart());
    LOG.warn("split length: " + split.getLength());
    String[] locs = split.getLocations();
    for (String loc : locs) {
        LOG.warn("location: " + loc);
    }
    start = split.getStart();
    end = start + split.getLength();
    LOG.warn("split end: " + end);
    final Path file = split.getPath();
    LOG.warn("file: " + file.getName());
    LOG.warn("INT split start: " + (int) split.getStart());
    LOG.warn("INT split length: " + (int) split.getLength());
    LOG.warn("INT split end: " + (int) end);
    FileSystem fs = file.getFileSystem(conf);
    codecFactory = new CompressionCodecFactory(conf);
    final CompressionCodec codec = codecFactory.getCodec(file);
    // Check for a missing codec before logging it, so a null codec fails with a clear message
    // instead of a NullPointerException.
    if (codec == null) {
        throw new IOException("No LZO codec found, cannot run.");
    }
    LOG.warn("codec: " + codec.toString());
    LOG.warn("config: " + conf.toString());
    // Open the file and seek to the next split.
    fileIn = fs.open(file);
    // Create input stream and read the file header.
    in = new LineReader(codec.createInputStream(fileIn), conf);
    if (start != 0) {
        fileIn.seek(start);
        LOG.warn("fileIn position: " + fileIn.getPos());
        LOG.warn("buffer size: " + conf.get("io.file.buffer.size"));
        // Read and ignore the first line.
        in.readLine(new Text());
        start = fileIn.getPos();
    }
    pos = start;
}
From source file:com.ebay.erl.mobius.core.criterion.TupleRestrictions.java
License:Apache License
/**
 * Create a tuple criterion that only accepts tuples whose value of the
 * <code>column</code> is present in the given <code>file</code>.
 * <p>
 *
 * The file is assumed to be a single-column text file with one or more lines.
 * Each line is read into a case-insensitive set, and that set is used to check
 * whether the value of the <code>column</code> is contained in it.
 *
 * @param column the name of the column whose value is tested for membership
 *               in the given <code>file</code>
 *
 * @param file a single-column, multi-line file containing strings/numbers;
 *             each line is treated as a single unit.
 *
 * @return an instance of {@link TupleCriterion} that extracts only the records
 *         whose <code>column</code> value is present in the given
 *         <code>file</code>.
 *
 * @throws FileNotFoundException if the given file cannot be found.
 */
public static TupleCriterion within(final String column, File file) throws FileNotFoundException {
    final File f = TupleRestrictions.checkFileExist(file);

    return new TupleCriterion() {

        private static final long serialVersionUID = -1121221619118915652L;

        private Set<String> set;

        @Override
        public void setConf(Configuration conf) {
            try {
                if (conf.get("tmpfiles") == null || conf.get("tmpfiles").trim().length() == 0) {
                    conf.set("tmpfiles", validateFiles(f.getAbsolutePath(), conf));
                } else {
                    conf.set("tmpfiles", validateFiles(f.getAbsolutePath(), conf) + "," + conf.get("tmpfiles"));
                }
            } catch (IOException e) {
                throw new IllegalArgumentException(e);
            }
        }

        /**
         * COPIED FROM org.apache.hadoop.util.GenericOptionsParser
         */
        private String validateFiles(String files, Configuration conf) throws IOException {
            if (files == null)
                return null;

            String[] fileArr = files.split(",");
            String[] finalArr = new String[fileArr.length];
            for (int i = 0; i < fileArr.length; i++) {
                String tmp = fileArr[i];
                String finalPath;
                Path path = new Path(tmp);
                URI pathURI = path.toUri();
                FileSystem localFs = FileSystem.getLocal(conf);
                if (pathURI.getScheme() == null) {
                    // default to the local file system
                    // check if the file exists or not first
                    if (!localFs.exists(path)) {
                        throw new FileNotFoundException("File " + tmp + " does not exist.");
                    }
                    finalPath = path.makeQualified(localFs).toString();
                } else {
                    // check if the file exists in this file system
                    // we need to recreate this filesystem object to copy
                    // these files to the file system jobtracker is running
                    // on.
                    FileSystem fs = path.getFileSystem(conf);
                    if (!fs.exists(path)) {
                        throw new FileNotFoundException("File " + tmp + " does not exist.");
                    }
                    finalPath = path.makeQualified(fs).toString();
                    try {
                        fs.close();
                    } catch (IOException e) {
                    }
                }
                finalArr[i] = finalPath;
            }
            return StringUtils.arrayToString(finalArr);
        }

        @Override
        protected boolean evaluate(Tuple tuple, Configuration configuration) {
            if (set == null) {
                set = new CaseInsensitiveTreeSet();
                BufferedReader br = null;
                try {
                    br = new BufferedReader(new FileReader(new File(f.getName())));
                    String newLine = null;
                    while ((newLine = br.readLine()) != null) {
                        this.set.add(newLine);
                    }
                } catch (IOException e) {
                    throw new RuntimeException(e);
                } finally {
                    try {
                        br.close();
                    } catch (Throwable e) {
                    }
                }
            }

            String value = tuple.getString(column);
            if (value != null) {
                return this.set.contains(value);
            } else {
                return false;
            }
        }

        @Override
        public String[] getInvolvedColumns() {
            return new String[] { column };
        }
    };
}
From source file:com.ebay.erl.mobius.core.datajoin.EvenlyPartitioner.java
License:Apache License
/**
 * Read in the partition file and build indexing data structures.
 * If the keytype is {@link org.apache.hadoop.io.BinaryComparable} and
 * <tt>total.order.partitioner.natural.order</tt> is not false, a trie
 * of the first <tt>total.order.partitioner.max.trie.depth</tt>(2) + 1 bytes
 * will be built. Otherwise, keys will be located using a binary search of
 * the partition keyset using the {@link org.apache.hadoop.io.RawComparator}
 * defined for this job. The input file must be sorted with the same
 * comparator and contain
 * {@link org.apache.hadoop.mapred.JobConf#getNumReduceTasks} - 1 keys.
 */
@SuppressWarnings("unchecked") // keytype from conf not static
public void configure(JobConf job) {
    try {
        String parts = getPartitionFile(job);
        final Path partFile = new Path(parts);
        final FileSystem fs = (DEFAULT_PATH.equals(parts)) ? FileSystem.getLocal(job) // assume in DistributedCache
                : partFile.getFileSystem(job);

        //Class<K> keyClass = (Class<K>)job.getMapOutputKeyClass();
        K[] splitPoints = readPartitions(fs, partFile, (Class<K>) Tuple.class, job);
        if (splitPoints.length != job.getNumReduceTasks() - 1) {
            throw new IOException("Wrong number of partitions in keyset");
        }

        RawComparator<K> comparator = (RawComparator<K>) job.getOutputKeyComparator();
        for (int i = 0; i < splitPoints.length - 1; ++i) {
            if (comparator.compare(splitPoints[i], splitPoints[i + 1]) >= 0) {
                throw new IOException("Split points are out of order");
            }
        }
        boolean natOrder = job.getBoolean("total.order.partitioner.natural.order", true);
        if (natOrder && BinaryComparable.class.isAssignableFrom(Tuple.class)) {
            partitions = buildTrie((BinaryComparable[]) splitPoints, 0, splitPoints.length, new byte[0],
                    job.getInt("total.order.partitioner.max.trie.depth", 2));
        } else {
            partitions = new BinarySearchNode(splitPoints, comparator);
        }
    } catch (IOException e) {
        throw new IllegalArgumentException("Can't read partitions file", e);
    }
}
From source file:com.ebay.nest.io.nestfile.FileDump.java
License:Apache License
public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    for (String filename : args) {
        System.out.println("Structure for " + filename);
        Path path = new Path(filename);
        Reader reader = OrcFile.createReader(path.getFileSystem(conf), path);
        RecordReaderImpl rows = (RecordReaderImpl) reader.rows(null);
        System.out.println("Rows: " + reader.getNumberOfRows());
        System.out.println("Compression: " + reader.getCompression());
        if (reader.getCompression() != CompressionKind.NONE) {
            System.out.println("Compression size: " + reader.getCompressionSize());
        }
        System.out.println("Type: " + reader.getObjectInspector().getTypeName());
        ColumnStatistics[] stats = reader.getStatistics();
        System.out.println("\nStatistics:");
        for (int i = 0; i < stats.length; ++i) {
            System.out.println(" Column " + i + ": " + stats[i].toString());
        }
        System.out.println("\nStripes:");
        for (StripeInformation stripe : reader.getStripes()) {
            long stripeStart = stripe.getOffset();
            System.out.println(" Stripe: " + stripe.toString());
            OrcProto.StripeFooter footer = rows.readStripeFooter(stripe);
            long sectionStart = stripeStart;
            for (OrcProto.Stream section : footer.getStreamsList()) {
                System.out.println(" Stream: column " + section.getColumn() + " section " + section.getKind()
                        + " start: " + sectionStart + " length " + section.getLength());
                sectionStart += section.getLength();
            }
            for (int i = 0; i < footer.getColumnsCount(); ++i) {
                OrcProto.ColumnEncoding encoding = footer.getColumns(i);
                StringBuilder buf = new StringBuilder();
                buf.append(" Encoding column ");
                buf.append(i);
                buf.append(": ");
                buf.append(encoding.getKind());
                if (encoding.getKind() == OrcProto.ColumnEncoding.Kind.DICTIONARY) {
                    buf.append("[");
                    buf.append(encoding.getDictionarySize());
                    buf.append("]");
                }
                System.out.println(buf);
            }
        }
    }
}
From source file:com.ebay.nest.io.nestfile.OrcFile.java
License:Apache License
public static Writer createWriter(Path path, WriterOptions opts) throws IOException {
    FileSystem fs = opts.fileSystemValue == null ? path.getFileSystem(opts.configuration)
            : opts.fileSystemValue;
    return new WriterImpl(fs, path, opts.configuration, opts.inspectorValue, opts.stripeSizeValue,
            opts.compressValue, opts.bufferSizeValue, opts.rowIndexStrideValue, opts.memoryManagerValue,
            opts.blockPaddingValue, opts.versionValue);
}
From source file:com.edwardsit.spark4n6.EWFFileReaderTest.java
License:Apache License
@Test
public void testGetEWFSection() throws IOException {
    log.setLevel(Level.DEBUG);
    Logger.getLogger("com.edwardsit.spark4n6")
            .addAppender(new RollingFileAppender(new PatternLayout(), "debug.log"));
    Configuration conf = new Configuration(false);
    Path path = new Path("../macwd.E01");
    // Path path = new Path("D:\\Users\\Derek\\Images\\500GB\\500GB-CDrive.E01");
    FileSystem fs = path.getFileSystem(conf);
    EWFFileReader reader = new EWFFileReader(fs, path);
    long size = reader.getImageSize();
    ArrayList<EWFSection.SectionPrefix> sections = reader.getSectionPrefixArray();
    Iterator<EWFSection.SectionPrefix> it = sections.iterator();
    EWFSection.SectionPrefix sp;
    long numSplits = 10L;
    long priorStart = 0L;
    long priorEnd = 0L;
    Path priorFile = null;
    log.debug(path.getName() + ": imageSize = " + size);
    log.debug("File\t\tChunkIndex\t\tSectionType\t\tChunkCount\t\tSectionSize");
    while (it.hasNext()) {
        sp = it.next();
        assertNotNull(sp);
        log.debug(sp.file + "\t\t" + sp.chunkIndex + "\t\t" + sp.sectionType + "\t\t" + sp.chunkCount + "\t\t"
                + sp.sectionSize);
        if (!sp.file.equals(priorFile) && sp.sectionType.equals(EWFSection.SectionType.TABLE_TYPE)) {
            if (priorFile != null) {
                priorEnd = sp.chunkIndex;
                // log.debug(priorFile + "Split#" + (numSplits * priorEnd * 64 * 512 / size) + ", " + priorStart + " to " + priorEnd);
            }
            priorFile = sp.file;
            priorStart = sp.chunkIndex;
        }
    }
    // log.debug(priorFile + " Split#" + (numSplits * priorEnd * 64 * 512 / size) + ", " + priorStart + " to " + size / 64 / 512);
}
From source file:com.edwardsit.spark4n6.EWFImageInputFormat.java
License:Apache License
@Override
public List<InputSplit> getSplits(JobContext job) throws IOException {
    log.setLevel(Level.DEBUG);
    List<InputSplit> splits = new ArrayList<InputSplit>();
    List<FileStatus> files = listStatus(job);
    BlockLocation[] blkLocations = null;
    Path path = null;
    FileSystem fs = null;
    EWFFileReader ewf = null;
    ArrayList<EWFSection.SectionPrefix> sections = null;
    Iterator<EWFSection.SectionPrefix> it = null;
    EWFSection.SectionPrefix sp = null;
    Path priorFile = null;
    long priorOffset = 0L;
    FileStatus priorFileStatus = null;
    chunkSize = new EWFSegmentFileReader(fs).DEFAULT_CHUNK_SIZE;
    long priorStart = 0L;
    int blkIndex = 0;
    for (FileStatus file : files) {
        path = file.getPath();
        fs = path.getFileSystem(job.getConfiguration());
        if (path.getName().endsWith(".E01")) {
            ewf = new EWFFileReader(fs, path);
            sections = ewf.getSectionPrefixArray();
            it = sections.iterator();
            while (it.hasNext()) {
                sp = it.next();
                if (sp.sectionType.equals(EWFSection.SectionType.TABLE_TYPE)) {
                    priorFileStatus = fs.getFileStatus(priorFile);
                    for (long i = sp.chunkCount; i > 0L; i = i - getChunksPerSplit(priorFileStatus)) {
                        if (priorFileStatus instanceof LocatedFileStatus) {
                            blkLocations = ((LocatedFileStatus) priorFileStatus).getBlockLocations();
                        } else {
                            blkLocations = fs.getFileBlockLocations(priorFileStatus, priorOffset,
                                    (getChunksPerSplit(priorFileStatus) * chunkSize));
                        }
                        blkIndex = getBlockIndex(blkLocations, priorOffset);
                        if (i > getChunksPerSplit(priorFileStatus)) {
                            log.debug("splits.add(makeSplit(" + priorFile + ", " + (priorStart * chunkSize) + ", "
                                    + (getChunksPerSplit(priorFileStatus) * chunkSize) + ", "
                                    + listHosts(blkLocations, blkIndex) + ");");
                            splits.add(makeSplit(priorFile, (priorStart * chunkSize),
                                    (getChunksPerSplit(priorFileStatus) * chunkSize),
                                    blkLocations[blkIndex].getHosts()));
                            priorStart += getChunksPerSplit(priorFileStatus);
                        } else {
                            log.debug("splits.add(makeSplit(" + priorFile + ", " + (priorStart * chunkSize) + ", "
                                    + (i * chunkSize) + ", " + listHosts(blkLocations, blkIndex) + ");");
                            splits.add(makeSplit(priorFile, (priorStart * chunkSize), (i * chunkSize),
                                    blkLocations[blkIndex].getHosts()));
                            priorStart += i;
                        }
                    }
                }
                priorFile = sp.file;
                priorOffset = sp.fileOffset;
            }
        }
    }
    return splits;
}
From source file:com.elex.dmp.core.TopicModel.java
License:Apache License
public void persist(Path outputDir, boolean overwrite) throws IOException {
    FileSystem fs = outputDir.getFileSystem(conf);
    if (overwrite) {
        fs.delete(outputDir, true); // CHECK second arg
    }
    DistributedRowMatrixWriter.write(outputDir, conf, topicTermCounts);
}