Example usage for org.apache.hadoop.fs Path getFileSystem

List of usage examples for org.apache.hadoop.fs Path getFileSystem

Introduction

On this page you can find example usages of the getFileSystem method of org.apache.hadoop.fs.Path.

Prototype

public FileSystem getFileSystem(Configuration conf) throws IOException 

Source Link

Document

Return the FileSystem that owns this Path.

Usage

From source file:ca.uwaterloo.iss4e.hadoop.io.CartesianInputFormat.java

License:Open Source License

/**
 * Computes the input splits for one data set by delegating to the named input format.
 *
 * <p>The given path is qualified against its own file system, escaped, and written to
 * {@code INPUT_DIR} in the job configuration (overwriting the current value) before the
 * instantiated input format is asked for its splits.
 *
 * @param jobContext       job context whose configuration is read and modified
 * @param inputFormatClass fully qualified class name of the input format to instantiate
 * @param path             input path of the data set
 * @return the splits reported by the instantiated input format
 * @throws ClassNotFoundException if {@code inputFormatClass} cannot be loaded
 * @throws IOException            propagated from resolving the file system or computing the splits
 */
private List<InputSplit> getInputSplits(JobContext jobContext, String inputFormatClass, Path path)
        throws ClassNotFoundException, IOException {
    Configuration conf = jobContext.getConfiguration();
    FileInputFormat inputFormat = (FileInputFormat) ReflectionUtils.newInstance(Class.forName(inputFormatClass),
            conf);

    // Point the configuration at this data set only; the previous INPUT_DIR value is
    // overwritten and not restored.
    path = path.getFileSystem(conf).makeQualified(path);
    conf.set(INPUT_DIR, StringUtils.escapeString(path.toString()));
    return inputFormat.getSplits(jobContext);
}

From source file:cascading.avro.AvroScheme.java

License:Apache License

/**
 * Peeks at the source data to obtain a schema when none has been provided.
 *
 * <p>The tap identifier is treated as a (possibly glob) path. The matched entries, the entries
 * one level below matched directories, and the entries below those child directories are
 * collected; the schema of the first collected entry that is a file is returned.
 *
 * @param flowProcess The cascading FlowProcess object for this flow.
 * @param tap         The cascading Tap object; for a CompositeTap its first child tap is used.
 * @return the schema read from the first file found, or a schema of type
 *         {@code Schema.Type.NULL} if the path matches nothing or no file is found.
 * @throws IOException if the file system cannot be accessed or the file cannot be read.
 */
private Schema getSourceSchema(FlowProcess<JobConf> flowProcess, Tap tap) throws IOException {

    if (tap instanceof CompositeTap) {
        tap = (Tap) ((CompositeTap) tap).getChildTaps().next();
    }
    final Path p = new Path(tap.getIdentifier());
    final FileSystem fs = p.getFileSystem(flowProcess.getConfigCopy());

    // globStatus returns null (not an empty array) when a non-glob path does not exist;
    // treat that the same as "no Avro files found" instead of failing in Arrays.asList.
    final FileStatus[] matched = fs.globStatus(p, filter);
    if (matched == null) {
        return Schema.create(Schema.Type.NULL);
    }

    // Get all the input dirs
    List<FileStatus> statuses = new LinkedList<FileStatus>(Arrays.asList(matched));
    // Now get all the things that are one level down; iterate the original array so that
    // appending to the list does not disturb the iteration.
    for (FileStatus status : matched) {
        if (!status.isDir()) {
            continue;
        }
        for (FileStatus child : fs.listStatus(status.getPath(), filter)) {
            if (child.isDir()) {
                statuses.addAll(Arrays.asList(fs.listStatus(child.getPath(), filter)));
            } else if (fs.isFile(child.getPath())) {
                statuses.add(child);
            }
        }
    }

    for (FileStatus status : statuses) {
        Path statusPath = status.getPath();
        if (!fs.isFile(statusPath)) {
            continue;
        }
        // no need to open them all: the first file decides the schema
        InputStream stream = null;
        DataFileStream reader = null;
        try {
            stream = new BufferedInputStream(fs.open(statusPath));
            reader = new DataFileStream(stream, new GenericDatumReader());
            return reader.getSchema();
        } finally {
            // Close the reader when it was created; otherwise close the raw stream directly.
            if (reader != null) {
                reader.close();
            } else if (stream != null) {
                stream.close();
            }
        }
    }
    // couldn't find any Avro files, return null schema
    return Schema.create(Schema.Type.NULL);
}

From source file:cascading.flow.hadoop.util.HadoopUtil.java

License:Open Source License

/**
 * Looks up the file system that owns the given path.
 *
 * @param config configuration passed to the file system lookup
 * @param path   path whose file system is requested
 * @return the file system returned by {@code path.getFileSystem(config)}
 * @throws FlowException wrapping the IOException if the lookup fails
 */
private static FileSystem getFileSystem(Configuration config, Path path) {
    final FileSystem fileSystem;
    try {
        fileSystem = path.getFileSystem(config);
    } catch (IOException exception) {
        throw new FlowException("unable to get handle to underlying filesystem", exception);
    }
    return fileSystem;
}

From source file:cascading.hcatalog.CascadingHCatUtil.java

License:Apache License

/**
 * Lists the files directly inside a partition's storage location.
 *
 * <p>Directories are skipped, as are files whose name fully matches the regex configured under
 * {@code HCatTap.IGNORE_FILE_IN_PARTITION_REGEX} (empty string when unset). An IOException
 * while listing is passed to {@code logError} and whatever was collected so far is returned.
 *
 * @param part    partition whose storage descriptor location is listed
 * @param jobConf job configuration used for the regex lookup and the file system lookup
 * @return the URI path component of every accepted file
 */
protected static List<String> getFilesInHivePartition(Partition part, JobConf jobConf) {
    List<String> result = newArrayList();

    final String ignoreFileRegex = jobConf.get(HCatTap.IGNORE_FILE_IN_PARTITION_REGEX, "");
    final Pattern ignoreFilePattern = Pattern.compile(ignoreFileRegex);

    try {
        final Path partitionDirPath = new Path(part.getSd().getLocation());
        for (FileStatus entry : partitionDirPath.getFileSystem(jobConf).listStatus(partitionDirPath)) {
            if (entry.isDir()) {
                continue;
            }

            final boolean ignored = ignoreFilePattern.matcher(entry.getPath().getName()).matches();
            if (ignored) {
                LOG.debug("Ignoring path {} since matches ignore regex {}",
                        entry.getPath().toUri().getPath(), ignoreFileRegex);
                continue;
            }

            result.add(entry.getPath().toUri().getPath());
        }
    } catch (IOException e) {
        logError("Unable to read the content of partition '" + part.getSd().getLocation() + "'", e);
    }

    return result;
}

From source file:cascading.scheme.DeprecatedAvroScheme.java

License:Apache License

/**
 * Peeks at the source data to obtain a schema when none has been provided.
 *
 * <p>The tap identifier is used as a path pattern. Entries matching the pattern are gathered
 * together with the contents of matched directories and of the directories directly beneath
 * them. The first gathered entry that is a file supplies the schema.
 *
 * @param flowProcess The cascading FlowProcess object for this flow.
 * @param tap         The cascading Tap object; for a CompositeTap its first child tap is used.
 * @return the schema of the first file found, or a schema of type {@code Schema.Type.NULL}
 *         when the pattern matches nothing or no file is found.
 * @throws IOException if the file system cannot be accessed or the file cannot be read.
 */
private Schema getSourceSchema(FlowProcess<? extends Configuration> flowProcess, Tap tap) throws IOException {

    if (tap instanceof CompositeTap) {
        tap = (Tap) ((CompositeTap) tap).getChildTaps().next();
    }
    final Path p = new Path(tap.getIdentifier());
    final FileSystem fs = p.getFileSystem(flowProcess.getConfigCopy());

    // A non-glob path that does not exist makes globStatus return null rather than an empty
    // array; report the NULL schema instead of letting Arrays.asList throw.
    final FileStatus[] globbed = fs.globStatus(p, filter);
    if (globbed == null) {
        return Schema.create(Schema.Type.NULL);
    }

    // Get all the input dirs
    List<FileStatus> statuses = new LinkedList<FileStatus>(Arrays.asList(globbed));
    // Now get all the things that are one level down; the array is iterated so the list can
    // grow safely during the loop.
    for (FileStatus status : globbed) {
        if (!status.isDir()) {
            continue;
        }
        for (FileStatus child : fs.listStatus(status.getPath(), filter)) {
            if (child.isDir()) {
                statuses.addAll(Arrays.asList(fs.listStatus(child.getPath(), filter)));
            } else if (fs.isFile(child.getPath())) {
                statuses.add(child);
            }
        }
    }

    for (FileStatus status : statuses) {
        Path statusPath = status.getPath();
        if (!fs.isFile(statusPath)) {
            continue;
        }
        // no need to open them all: the first file decides the schema
        InputStream stream = null;
        DataFileStream reader = null;
        try {
            stream = new BufferedInputStream(fs.open(statusPath));
            reader = new DataFileStream(stream, new GenericDatumReader());
            return reader.getSchema();
        } finally {
            // Close the reader when it was created; otherwise close the raw stream directly.
            if (reader != null) {
                reader.close();
            } else if (stream != null) {
                stream.close();
            }
        }
    }
    // couldn't find any Avro files, return null schema
    return Schema.create(Schema.Type.NULL);
}

From source file:cascading.tap.GlobHfs.java

License:Open Source License

/**
 * Expands {@code pathPattern} into one Hfs tap per matching entry with non-zero length.
 *
 * @param conf job configuration used to look up the file system of the pattern
 * @return the taps for every matching entry whose reported length is not zero
 * @throws IOException  if the file system lookup or the glob fails
 * @throws TapException if nothing matches, or every match has zero length
 */
private Tap[] makeTaps(JobConf conf) throws IOException {
    final Path path = new Path(pathPattern);
    final FileSystem fileSystem = path.getFileSystem(conf);

    // Only pass the filter along when one has been configured.
    final FileStatus[] statusList = pathFilter == null
            ? fileSystem.globStatus(path)
            : fileSystem.globStatus(path, pathFilter);

    if (statusList == null || statusList.length == 0) {
        throw new TapException("unable to find paths matching path pattern: " + pathPattern);
    }

    final List<Hfs> notEmpty = new ArrayList<Hfs>();
    for (FileStatus status : statusList) {
        // Skip entries reporting zero length; per the original author, a directory reports a
        // non-zero length and is therefore kept.
        if (status.getLen() == 0) {
            continue;
        }
        notEmpty.add(new Hfs(getScheme(), status.getPath().toString()));
    }

    if (notEmpty.isEmpty()) {
        throw new TapException("all paths matching path pattern are zero length: " + pathPattern);
    }

    return notEmpty.toArray(new Tap[notEmpty.size()]);
}

From source file:cascading.tap.hadoop.GlobHfs.java

License:Open Source License

/**
 * Expands {@code pathPattern} into one Hfs tap per matching directory or non-empty file.
 *
 * @param conf configuration used to look up the file system of the pattern
 * @return the taps for every match that is a directory or has a non-zero length
 * @throws IOException  if the file system lookup or the glob fails
 * @throws TapException if nothing matches, or every match is a zero-length non-directory
 */
private Hfs[] makeTaps(Configuration conf) throws IOException {
    final Path path = new Path(pathPattern);
    final FileSystem fileSystem = path.getFileSystem(conf);

    // Only pass the filter along when one has been configured.
    final FileStatus[] statusList = pathFilter == null
            ? fileSystem.globStatus(path)
            : fileSystem.globStatus(path, pathFilter);

    if (statusList == null || statusList.length == 0) {
        throw new TapException("unable to find paths matching path pattern: " + pathPattern);
    }

    final List<Hfs> notEmpty = new ArrayList<Hfs>();
    for (FileStatus status : statusList) {
        // Directories are always kept (their reported length differs between Hadoop versions,
        // per the original author); plain entries are kept only when non-empty.
        final boolean keep = status.isDir() || status.getLen() != 0;
        if (keep) {
            notEmpty.add(new Hfs(getScheme(), status.getPath().toString()));
        }
    }

    if (notEmpty.isEmpty()) {
        throw new TapException(
                "all paths matching path pattern are zero length and not directories: " + pathPattern);
    }

    return notEmpty.toArray(new Hfs[notEmpty.size()]);
}

From source file:cascading.tap.hadoop.Hadoop18TapUtil.java

License:Open Source License

/**
 * Looks up the file system for {@code tmpDir} without propagating I/O failures.
 *
 * @param conf   job configuration passed to the file system lookup
 * @param tmpDir path whose file system is requested
 * @return the file system, or {@code null} if the lookup threw an IOException
 */
private static FileSystem getFSSafe(JobConf conf, Path tmpDir) {
    FileSystem resolved = null;
    try {
        resolved = tmpDir.getFileSystem(conf);
    } catch (IOException ignored) {
        // deliberately ignored: callers receive null when the lookup fails
    }
    return resolved;
}

From source file:cascading.tap.hadoop.Hadoop18TapUtil.java

License:Open Source License

/**
 * Builds the task-specific path below the job output path.
 *
 * <p>The path is {@code TEMPORARY_PATH}, a separator, and {@code "_"} followed by the value of
 * {@code mapred.task.id}, resolved against the job output path.
 *
 * @param conf job configuration supplying the task id and the output path
 * @return the path qualified against its file system, or the unqualified path if the file
 *         system lookup throws an IOException
 */
private static Path getTaskOutputPath(JobConf conf) {
    final String taskId = conf.get("mapred.task.id");
    final Path taskPath = new Path(FileOutputFormat.getOutputPath(conf),
            TEMPORARY_PATH + Path.SEPARATOR + "_" + taskId);

    Path result;
    try {
        result = taskPath.makeQualified(taskPath.getFileSystem(conf));
    } catch (IOException ie) {
        // fall back to the unqualified path
        result = taskPath;
    }
    return result;
}

From source file:cascading.tap.hadoop.Hadoop18TapUtil.java

License:Open Source License

/**
 * Creates the job-specific temporary directory inside the job output path.
 *
 * <p>Nothing happens when no output path is configured or the directory already exists. A
 * failed {@code mkdirs} is logged as an error and not thrown.
 *
 * @param conf job configuration supplying the output path
 * @throws IOException if the file system lookup, existence check or directory creation throws
 */
public static void makeTempPath(JobConf conf) throws IOException {
    final Path outputPath = FileOutputFormat.getOutputPath(conf);
    if (outputPath == null) {
        return;
    }

    final Path tmpDir = new Path(outputPath, TEMPORARY_PATH);
    final FileSystem fileSys = tmpDir.getFileSystem(conf);
    if (fileSys.exists(tmpDir)) {
        return;
    }

    final boolean created = fileSys.mkdirs(tmpDir);
    if (!created) {
        LOG.error("mkdirs failed to create " + tmpDir.toString());
    }
}