List of usage examples for org.apache.hadoop.fs.Path#getFileSystem
public FileSystem getFileSystem(Configuration conf) throws IOException
From source file:ca.uwaterloo.iss4e.hadoop.io.CartesianInputFormat.java
License:Open Source License
private List<InputSplit> getInputSplits(JobContext jobContext, String inputFormatClass, Path path) throws ClassNotFoundException, IOException { Configuration conf = jobContext.getConfiguration(); FileInputFormat inputFormat = (FileInputFormat) ReflectionUtils.newInstance(Class.forName(inputFormatClass), conf);//w ww . j a va 2s. co m // Set the input path for the left data set path = path.getFileSystem(conf).makeQualified(path); String dirStr = StringUtils.escapeString(path.toString()); String dirs = conf.get(INPUT_DIR); conf.set(INPUT_DIR, dirStr); return inputFormat.getSplits(jobContext); }
From source file:cascading.avro.AvroScheme.java
License:Apache License
/** * This method peeks at the source data to get a schema when none has been provided. * * @param flowProcess The cascading FlowProcess object for this flow. * @param tap The cascading Tap object. * @return Schema The schema of the peeked at data, or Schema.NULL if none exists. *//* ww w.ja v a 2 s . co m*/ private Schema getSourceSchema(FlowProcess<JobConf> flowProcess, Tap tap) throws IOException { if (tap instanceof CompositeTap) { tap = (Tap) ((CompositeTap) tap).getChildTaps().next(); } final String path = tap.getIdentifier(); Path p = new Path(path); final FileSystem fs = p.getFileSystem(flowProcess.getConfigCopy()); // Get all the input dirs List<FileStatus> statuses = new LinkedList<FileStatus>(Arrays.asList(fs.globStatus(p, filter))); // Now get all the things that are one level down for (FileStatus status : new LinkedList<FileStatus>(statuses)) { if (status.isDir()) for (FileStatus child : Arrays.asList(fs.listStatus(status.getPath(), filter))) { if (child.isDir()) { statuses.addAll(Arrays.asList(fs.listStatus(child.getPath(), filter))); } else if (fs.isFile(child.getPath())) { statuses.add(child); } } } for (FileStatus status : statuses) { Path statusPath = status.getPath(); if (fs.isFile(statusPath)) { // no need to open them all InputStream stream = null; DataFileStream reader = null; try { stream = new BufferedInputStream(fs.open(statusPath)); reader = new DataFileStream(stream, new GenericDatumReader()); return reader.getSchema(); } finally { if (reader == null) { if (stream != null) { stream.close(); } } else { reader.close(); } } } } // couldn't find any Avro files, return null schema return Schema.create(Schema.Type.NULL); }
From source file:cascading.flow.hadoop.util.HadoopUtil.java
License:Open Source License
/**
 * Looks up the file system that owns {@code path}, converting the checked failure into an
 * unchecked {@code FlowException}.
 *
 * @param config configuration handed to {@code Path.getFileSystem}
 * @param path   path whose file system is requested
 * @return the file system returned for {@code path}
 * @throws FlowException wrapping the {@code IOException} raised by the lookup
 */
private static FileSystem getFileSystem(Configuration config, Path path) {
    final FileSystem fileSystem;

    try {
        fileSystem = path.getFileSystem(config);
    } catch (IOException exception) {
        throw new FlowException("unable to get handle to underlying filesystem", exception);
    }

    return fileSystem;
}
From source file:cascading.hcatalog.CascadingHCatUtil.java
License:Apache License
/**
 * Lists the files stored directly inside the directory of a Hive partition.
 *
 * <p>Sub-directories are skipped, as are files whose name fully matches the regular
 * expression configured under {@code HCatTap.IGNORE_FILE_IN_PARTITION_REGEX} (default: the
 * empty pattern). An I/O failure is reported through {@code logError} and whatever was
 * collected so far is returned.
 *
 * @param part    partition whose storage-descriptor location is listed
 * @param jobConf job configuration used to read the ignore pattern and resolve the file system
 * @return the URI path component of every accepted file; empty if the directory cannot be listed
 */
protected static List<String> getFilesInHivePartition(Partition part, JobConf jobConf) {
    List<String> result = newArrayList();

    String ignoreFileRegex = jobConf.get(HCatTap.IGNORE_FILE_IN_PARTITION_REGEX, "");
    Pattern ignoreFilePattern = Pattern.compile(ignoreFileRegex);

    try {
        Path partitionDirPath = new Path(part.getSd().getLocation());

        FileStatus[] partitionContent = partitionDirPath.getFileSystem(jobConf).listStatus(partitionDirPath);

        // NOTE(review): older Hadoop releases return null from listStatus() for a missing
        // directory instead of throwing; guard so that case yields an empty list, not an NPE.
        if (partitionContent == null) {
            return result;
        }

        for (FileStatus currStatus : partitionContent) {
            if (!currStatus.isDir()) {
                if (!ignoreFilePattern.matcher(currStatus.getPath().getName()).matches()) {
                    result.add(currStatus.getPath().toUri().getPath());
                } else {
                    LOG.debug("Ignoring path {} since matches ignore regex {}",
                            currStatus.getPath().toUri().getPath(), ignoreFileRegex);
                }
            }
        }
    } catch (IOException e) {
        logError("Unable to read the content of partition '" + part.getSd().getLocation() + "'", e);
    }

    return result;
}
From source file:cascading.scheme.DeprecatedAvroScheme.java
License:Apache License
/** * This method peeks at the source data to get a schema when none has been provided. * * @param flowProcess The cascading FlowProcess object for this flow. * @param tap The cascading Tap object. * @return Schema The schema of the peeked at data, or Schema.NULL if none exists. *//*w ww . j a v a 2 s . c om*/ private Schema getSourceSchema(FlowProcess<? extends Configuration> flowProcess, Tap tap) throws IOException { if (tap instanceof CompositeTap) { tap = (Tap) ((CompositeTap) tap).getChildTaps().next(); } final String path = tap.getIdentifier(); Path p = new Path(path); final FileSystem fs = p.getFileSystem(flowProcess.getConfigCopy()); // Get all the input dirs List<FileStatus> statuses = new LinkedList<FileStatus>(Arrays.asList(fs.globStatus(p, filter))); // Now get all the things that are one level down for (FileStatus status : new LinkedList<FileStatus>(statuses)) { if (status.isDir()) for (FileStatus child : Arrays.asList(fs.listStatus(status.getPath(), filter))) { if (child.isDir()) { statuses.addAll(Arrays.asList(fs.listStatus(child.getPath(), filter))); } else if (fs.isFile(child.getPath())) { statuses.add(child); } } } for (FileStatus status : statuses) { Path statusPath = status.getPath(); if (fs.isFile(statusPath)) { // no need to open them all InputStream stream = null; DataFileStream reader = null; try { stream = new BufferedInputStream(fs.open(statusPath)); reader = new DataFileStream(stream, new GenericDatumReader()); return reader.getSchema(); } finally { if (reader == null) { if (stream != null) { stream.close(); } } else { reader.close(); } } } } // couldn't find any Avro files, return null schema return Schema.create(Schema.Type.NULL); }
From source file:cascading.tap.GlobHfs.java
License:Open Source License
private Tap[] makeTaps(JobConf conf) throws IOException { FileStatus[] statusList = null;/*from ww w. j a va2 s . c o m*/ Path path = new Path(pathPattern); FileSystem fileSystem = path.getFileSystem(conf); if (pathFilter == null) statusList = fileSystem.globStatus(path); else statusList = fileSystem.globStatus(path, pathFilter); if (statusList == null || statusList.length == 0) throw new TapException("unable to find paths matching path pattern: " + pathPattern); List<Hfs> notEmpty = new ArrayList<Hfs>(); for (int i = 0; i < statusList.length; i++) { // remove empty files. turns out a directory returns a length not zero // so this jives with the expectations set in the above javadoc if (statusList[i].getLen() != 0) notEmpty.add(new Hfs(getScheme(), statusList[i].getPath().toString())); } if (notEmpty.isEmpty()) throw new TapException("all paths matching path pattern are zero length: " + pathPattern); return notEmpty.toArray(new Tap[notEmpty.size()]); }
From source file:cascading.tap.hadoop.GlobHfs.java
License:Open Source License
private Hfs[] makeTaps(Configuration conf) throws IOException { FileStatus[] statusList;//from w w w . ja v a 2 s .c o m Path path = new Path(pathPattern); FileSystem fileSystem = path.getFileSystem(conf); if (pathFilter == null) statusList = fileSystem.globStatus(path); else statusList = fileSystem.globStatus(path, pathFilter); if (statusList == null || statusList.length == 0) throw new TapException("unable to find paths matching path pattern: " + pathPattern); List<Hfs> notEmpty = new ArrayList<Hfs>(); for (int i = 0; i < statusList.length; i++) { // remove empty files. some hadoop versions return non-zero for dirs // so this jives with the expectations set in the above javadoc if (statusList[i].isDir() || statusList[i].getLen() != 0) notEmpty.add(new Hfs(getScheme(), statusList[i].getPath().toString())); } if (notEmpty.isEmpty()) throw new TapException( "all paths matching path pattern are zero length and not directories: " + pathPattern); return notEmpty.toArray(new Hfs[notEmpty.size()]); }
From source file:cascading.tap.hadoop.Hadoop18TapUtil.java
License:Open Source License
private static FileSystem getFSSafe(JobConf conf, Path tmpDir) { try {/*w w w. j a v a 2 s . c o m*/ return tmpDir.getFileSystem(conf); } catch (IOException e) { // ignore } return null; }
From source file:cascading.tap.hadoop.Hadoop18TapUtil.java
License:Open Source License
/**
 * Builds the per-task output path
 * {@code <job output path>/<TEMPORARY_PATH>/_<mapred.task.id>}.
 *
 * @param conf job configuration supplying the output path and the {@code mapred.task.id} value
 * @return the path qualified against its file system, or the unqualified path when the file
 *         system lookup raises an {@code IOException}
 */
private static Path getTaskOutputPath(JobConf conf) {
    String taskId = conf.get("mapred.task.id");
    Path jobOutputPath = FileOutputFormat.getOutputPath(conf);
    String taskDirName = TEMPORARY_PATH + Path.SEPARATOR + "_" + taskId;

    Path taskPath = new Path(jobOutputPath, taskDirName);

    try {
        return taskPath.makeQualified(taskPath.getFileSystem(conf));
    } catch (IOException ie) {
        // Fall back to the unqualified path when the file system is unavailable.
        return taskPath;
    }
}
From source file:cascading.tap.hadoop.Hadoop18TapUtil.java
License:Open Source License
public static void makeTempPath(JobConf conf) throws IOException { // create job specific temporary directory in output path Path outputPath = FileOutputFormat.getOutputPath(conf); if (outputPath != null) { Path tmpDir = new Path(outputPath, TEMPORARY_PATH); FileSystem fileSys = tmpDir.getFileSystem(conf); if (!fileSys.exists(tmpDir) && !fileSys.mkdirs(tmpDir)) { LOG.error("mkdirs failed to create " + tmpDir.toString()); }// www .j a v a 2 s . co m } }