List of usage examples for org.apache.hadoop.fs.FileSystem#isFile
@Deprecated public boolean isFile(Path f) throws IOException
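Before the project-specific examples, here is a minimal sketch of the call pattern they all share: probe a Path with isFile/isDirectory and branch on the result. Because isFile is deprecated, newer code usually goes through getFileStatus(Path) and FileStatus#isFile() instead; the class name and the local path below are illustrative only, not taken from any of the projects listed.

import java.io.FileNotFoundException;
import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class IsFileSketch {

    // Deprecated style used in the examples below: returns false when the path does not exist.
    static boolean isFileDeprecated(FileSystem fs, Path p) throws IOException {
        return fs.isFile(p);
    }

    // Non-deprecated equivalent: getFileStatus throws FileNotFoundException for a missing path,
    // so the "does not exist" case has to be handled explicitly.
    static boolean isFileCurrent(FileSystem fs, Path p) throws IOException {
        try {
            FileStatus status = fs.getFileStatus(p);
            return status.isFile();
        } catch (FileNotFoundException e) {
            return false;
        }
    }

    public static void main(String[] args) throws IOException {
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(conf);
        Path p = new Path("/tmp/example.txt"); // illustrative path
        System.out.println("isFile (deprecated):        " + isFileDeprecated(fs, p));
        System.out.println("isFile (via getFileStatus): " + isFileCurrent(fs, p));
    }
}

The examples that follow keep the deprecated form, since that is what the original projects use; most of them pair fs.isFile(p) with fs.isDirectory(p) and fall back to an error when the path is neither.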
From source file:org.talend.components.test.MiniDfsResource.java
License:Open Source License
/**
 * Tests that a file on the HDFS cluster contains the given texts.
 *
 * @param path the name of the file on the HDFS cluster
 * @param expected the expected lines in the file (not including terminating end-of-lines).
 */
public static void assertReadFile(String recordDelimiter, FileSystem fs, String path, String... expected)
        throws IOException {
    Path p = new Path(path);
    if (fs.isFile(p)) {
        try (BufferedReader r = new BufferedReader(new InputStreamReader(fs.open(new Path(path))))) {
            Scanner s = new Scanner(r).useDelimiter(recordDelimiter);
            for (String line : expected) {
                assertThat(s.next(), is(line));
            }
            assertThat(s.hasNext(), is(false));
        }
    } else if (fs.isDirectory(p)) {
        HashSet<String> expect = new HashSet<>(Arrays.asList(expected));
        for (FileStatus fstatus : fs.listStatus(p)) {
            try (BufferedReader r = new BufferedReader(new InputStreamReader(fs.open(fstatus.getPath())))) {
                Scanner s = new Scanner(r).useDelimiter(recordDelimiter);
                String line = null;
                while (s.hasNext()) {
                    line = s.next();
                    if (!expect.remove(line))
                        fail("Unexpected line: " + line);
                }
            }
        }
        // Check before asserting for the message.
        if (expect.size() != 0)
            assertThat("Not all lines found: " + expect.iterator().next(), expect, hasSize(0));
    } else {
        fail("No such path: " + path);
    }
}
From source file:org.talend.components.test.MiniDfsResource.java
License:Open Source License
/**
 * Tests that a file on the HDFS cluster contains the given avro.
 *
 * @param path the name of the file on the HDFS cluster
 * @param expected the expected avro records in the file.
 */
public static void assertReadAvroFile(FileSystem fs, String path, Set<IndexedRecord> expected, boolean part)
        throws IOException {
    Path p = new Path(path);
    if (fs.isFile(p)) {
        try (DataFileStream<GenericRecord> reader = new DataFileStream<GenericRecord>(
                new BufferedInputStream(fs.open(new Path(path))), new GenericDatumReader<GenericRecord>())) {
            IndexedRecord record = null;
            while (reader.hasNext()) {
                record = reader.iterator().next();
                IndexedRecord eqRecord = null;
                for (IndexedRecord indexedRecord : expected) {
                    if (indexedRecord.equals(record)) {
                        eqRecord = indexedRecord;
                        break;
                    }
                }
                expected.remove(eqRecord);
            }
        }
        // Check before asserting for the message.
        if (!part && expected.size() != 0)
            assertThat("Not all avro records found: " + expected.iterator().next(), expected, hasSize(0));
    } else if (fs.isDirectory(p)) {
        for (FileStatus fstatus : FileSystemUtil.listSubFiles(fs, p)) {
            assertReadAvroFile(fs, fstatus.getPath().toString(), expected, true);
        }
        // Check before asserting for the message.
        if (expected.size() != 0)
            assertThat("Not all avro records found: " + expected.iterator().next(), expected, hasSize(0));
    } else {
        fail("No such path: " + path);
    }
}
From source file:org.talend.components.test.MiniDfsResource.java
License:Open Source License
/**
 * Tests that a file on the HDFS cluster contains the given parquet.
 *
 * @param path the name of the file on the HDFS cluster
 * @param expected the expected avro records in the file.
 */
public static void assertReadParquetFile(FileSystem fs, String path, Set<IndexedRecord> expected, boolean part)
        throws IOException {
    Path p = new Path(path);
    if (fs.isFile(p)) {
        try (AvroParquetReader<GenericRecord> reader = new AvroParquetReader<GenericRecord>(fs.getConf(),
                new Path(path))) {
            IndexedRecord record = null;
            while (null != (record = reader.read())) {
                IndexedRecord eqRecord = null;
                for (IndexedRecord indexedRecord : expected) {
                    if (indexedRecord.equals(record)) {
                        eqRecord = indexedRecord;
                        break;
                    }
                }
                expected.remove(eqRecord);
            }
        }
        // Check before asserting for the message.
        if (!part && expected.size() != 0)
            assertThat("Not all avro records found: " + expected.iterator().next(), expected, hasSize(0));
    } else if (fs.isDirectory(p)) {
        for (FileStatus fstatus : FileSystemUtil.listSubFiles(fs, p)) {
            assertReadParquetFile(fs, fstatus.getPath().toString(), expected, true);
        }
        // Check before asserting for the message.
        if (expected.size() != 0)
            assertThat("Not all avro records found: " + expected.iterator().next(), expected, hasSize(0));
    } else {
        fail("No such path: " + path);
    }
}
From source file:org.trustedanalytics.resourceserver.data.InputStreamProvider.java
License:Apache License
/**
 * Gets an InputStream for a path on HDFS.
 *
 * If the given path is a directory, it reads the files inside that dir and creates
 * a SequenceInputStream from them, which emulates reading from the directory just like
 * from a regular file. Notice that this method (like the whole project) is not meant
 * to read huge datasets.
 *
 * @param path
 * @return
 * @throws IOException
 */
public InputStream getInputStream(Path path) throws IOException {
    Objects.requireNonNull(path);
    FileSystem fs = hdfsConfig.getFileSystem();
    if (fs.isFile(path)) {
        return fs.open(path);
    } else if (fs.isDirectory(path)) {
        FileStatus[] files = fs.listStatus(path);
        List<InputStream> paths = Arrays.stream(files).map(f -> {
            try {
                return fs.open(f.getPath());
            } catch (IOException e) {
                LOGGER.log(Level.SEVERE, "Cannot read file " + f.getPath().toString(), e);
                return null;
            }
        }).filter(f -> f != null).collect(Collectors.toList());
        return new SequenceInputStream(Collections.enumeration(paths));
    } else {
        throw new IllegalArgumentException("Given path " + path.toString() + " is neither file nor directory");
    }
}
From source file:pl.edu.icm.coansys.statisticsgenerator.tools.ViewTool.java
License:Open Source License
private static void processFileOrDirectory(Path pt, Configuration conf) throws IOException {
    FileSystem fs = pt.getFileSystem(conf);
    if (fs.isDirectory(pt)) {
        for (FileStatus fstat : fs.listStatus(pt)) {
            processFileOrDirectory(fstat.getPath(), conf);
        }
    } else if (fs.isFile(pt)) {
        viewFile(pt, conf);
    } else {
        // log an error
    }
}
From source file:streaming.core.DownloadRunner.java
License:Apache License
public static int getTarFileByPath(HttpServletResponse res, String pathStr) {
    String[] paths = pathStr.split(",");
    try {
        OutputStream outputStream = res.getOutputStream();
        TarOutputStream tarOutputStream = new TarOutputStream(new BufferedOutputStream(outputStream));

        FileSystem fs = FileSystem.get(new Configuration());
        List<FileStatus> files = new ArrayList<FileStatus>();

        for (String path : paths) {
            Path p = new Path(path);
            if (fs.exists(p)) {
                if (fs.isFile(p)) {
                    files.add(fs.getFileStatus(p));
                } else if (fs.isDirectory(p)) {
                    FileStatus[] fileStatusArr = fs.listStatus(p);
                    if (fileStatusArr != null && fileStatusArr.length > 0) {
                        for (FileStatus cur : fileStatusArr) {
                            if (cur.isFile()) {
                                files.add(cur);
                            }
                        }
                    }
                }
            }
        }

        if (files.size() > 0) {
            FSDataInputStream inputStream = null;
            int len = files.size();
            int i = 1;
            for (FileStatus cur : files) {
                logger.info("[" + i++ + "/" + len + "] adding " + cur);
                inputStream = fs.open(cur.getPath());
                tarOutputStream.putNextEntry(new HDFSTarEntry(cur, cur.getPath().getName()));
                org.apache.commons.io.IOUtils.copyLarge(inputStream, tarOutputStream);
                inputStream.close();
            }
            tarOutputStream.flush();
            tarOutputStream.close();
            return 200;
        } else
            return 400;

    } catch (Exception e) {
        e.printStackTrace();
        return 500;
    }
}
From source file:streaming.core.DownloadRunner.java
License:Apache License
public static int getRawFileByPath(HttpServletResponse res, String path, long position) {
    try {
        FileSystem fs = FileSystem.get(new Configuration());

        Path p = new Path(path);
        if (fs.exists(p)) {
            List<FileStatus> files = new ArrayList<FileStatus>();

            // collect the file (or the files of the directory) to stream
            if (fs.isFile(p)) {
                files.add(fs.getFileStatus(p));
            } else if (fs.isDirectory(p)) {
                FileStatus[] fileStatusArr = fs.listStatus(p);
                if (fileStatusArr != null && fileStatusArr.length > 0) {
                    for (FileStatus cur : fileStatusArr) {
                        files.add(cur);
                    }
                }
            }

            // stream the matched files to the response
            if (files.size() > 0) {
                logger.info(path + " matched " + files.size() + " file(s)");
                FSDataInputStream inputStream = null;
                OutputStream outputStream = res.getOutputStream();

                int len = files.size();
                int i = 1;
                long allPosition = 0;
                for (FileStatus cur : files) {
                    logger.info("[" + i++ + "/" + len + "] " + path + ", current file: " + cur);
                    inputStream = fs.open(cur.getPath());

                    if (position > 0) {
                        if (allPosition + cur.getLen() > position) {
                            inputStream.seek(position - allPosition);
                            logger.info("seek position " + (position - allPosition));
                            position = -1;
                        }
                        allPosition += cur.getLen();
                    }
                    org.apache.commons.io.IOUtils.copyLarge(inputStream, outputStream);
                    inputStream.close();
                }
                outputStream.flush();
                outputStream.close();
                return 200;
            } else {
                logger.info(path + " matched no files");
            }

        } else {
            return 400;
        }

    } catch (Exception e) {
        e.printStackTrace();
    }

    return 500;
}
From source file:uk.bl.wa.hadoop.mapreduce.hash.HdfsFileHasher.java
License:Open Source License
@Override
public int run(String[] args) throws Exception {
    // Options:
    String[] otherArgs = new GenericOptionsParser(args).getRemainingArgs();
    // Process remaining args like this:
    Options options = new Options();
    options.addOption("i", true, "a local file containing a list of HDFS paths to process");
    options.addOption("o", true, "output directory");
    options.addOption("m", false, "use MD5 rather than SHA-512");
    options.addOption("r", true, "number of reducers (defaults to 1)");

    CommandLineParser parser = new PosixParser();
    CommandLine cmd = parser.parse(options, otherArgs);
    if (!cmd.hasOption("i") || !cmd.hasOption("o")) {
        HelpFormatter helpFormatter = new HelpFormatter();
        helpFormatter.setWidth(80);
        helpFormatter.printHelp(CLI_USAGE, CLI_HEADER, options, "");
        System.exit(1);
    }
    String input_file = cmd.getOptionValue("i");
    String output_path = cmd.getOptionValue("o");
    String algorithm = null;
    int numReducers = 1;
    if (cmd.hasOption("m")) {
        algorithm = "MD5";
    }
    if (cmd.hasOption("r")) {
        numReducers = Integer.parseInt(cmd.getOptionValue("r"));
    }

    // When implementing tool, choose algorithm:
    Configuration conf = this.getConf();
    if (algorithm != null)
        conf.set(MessageDigestMapper.CONFIG_DIGEST_ALGORITHM, algorithm);

    // Create job
    Job job = new Job(conf, "HDFS File Checksummer");
    job.setJarByClass(HdfsFileHasher.class);

    // Setup MapReduce job
    job.setMapperClass(MessageDigestMapper.class);
    job.setReducerClass(Reducer.class);

    // Just one output file by default:
    job.setNumReduceTasks(numReducers);

    // Specify key / value
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);

    // Input
    log.info("Reading input files...");
    String line = null;
    long line_count = 0;
    BufferedReader br = new BufferedReader(new FileReader(input_file));
    while ((line = br.readLine()) != null) {
        if (StringUtils.isEmpty(line))
            continue;
        line_count++;
        Path path = new Path(line);
        FileSystem fs = path.getFileSystem(conf);
        if (fs.isFile(path)) {
            FileInputFormat.addInputPath(job, path);
        } else if (fs.isDirectory(path)) {
            FileStatus[] listing = fs.listStatus(path);
            int list_count = 0;
            for (FileStatus fstat : listing) {
                list_count++;
                log.info("Checking " + list_count + "/" + listing.length + " " + fstat.getPath());
                if (!fstat.isDir()) {
                    FileInputFormat.addInputPath(job, fstat.getPath());
                }
            }
        }
    }
    br.close();
    log.info("Read " + FileInputFormat.getInputPaths(job).length + " input files from " + line_count
            + " paths.");
    job.setInputFormatClass(UnsplittableInputFileFormat.class);

    // Output
    FileOutputFormat.setOutputPath(job, new Path(output_path));
    job.setOutputFormatClass(TextOutputFormat.class);

    // Execute job and return status
    return job.waitForCompletion(true) ? 0 : 1;
}
From source file:voldemort.store.readonly.fetcher.HdfsFetcher.java
License:Apache License
private boolean fetch(FileSystem fs, Path source, File dest, CopyStats stats) throws IOException {
    if (!fs.isFile(source)) {
        Utils.mkdirs(dest);
        FileStatus[] statuses = fs.listStatus(source);
        if (statuses != null) {
            // sort the files so that index files come last. Maybe
            // this will help keep them cached until the swap
            Arrays.sort(statuses, new IndexFileLastComparator());
            byte[] origCheckSum = null;
            CheckSumType checkSumType = CheckSumType.NONE;

            // Do a checksum of checksum - Similar to HDFS
            CheckSum checkSumGenerator = null;
            CheckSum fileCheckSumGenerator = null;

            for (FileStatus status : statuses) {

                // Kept for backwards compatibility
                if (status.getPath().getName().contains("checkSum.txt")) {

                    // Ignore old checksum files

                } else if (status.getPath().getName().contains(".metadata")) {

                    logger.debug("Reading .metadata");

                    // Read metadata into local file
                    File copyLocation = new File(dest, status.getPath().getName());
                    copyFileWithCheckSum(fs, status.getPath(), copyLocation, stats, null);

                    // Open the local file to initialize checksum
                    ReadOnlyStorageMetadata metadata;
                    try {
                        metadata = new ReadOnlyStorageMetadata(copyLocation);
                    } catch (IOException e) {
                        logger.error("Error reading metadata file ", e);
                        throw new VoldemortException(e);
                    }

                    // Read checksum
                    String checkSumTypeString = (String) metadata.get(ReadOnlyStorageMetadata.CHECKSUM_TYPE);
                    String checkSumString = (String) metadata.get(ReadOnlyStorageMetadata.CHECKSUM);

                    if (checkSumTypeString != null && checkSumString != null) {
                        try {
                            origCheckSum = Hex.decodeHex(checkSumString.toCharArray());
                        } catch (DecoderException e) {
                            logger.error("Exception reading checksum file. Ignoring checksum ", e);
                            continue;
                        }

                        logger.debug("Checksum from .metadata " + new String(Hex.encodeHex(origCheckSum)));
                        checkSumType = CheckSum.fromString(checkSumTypeString);
                        checkSumGenerator = CheckSum.getInstance(checkSumType);
                        fileCheckSumGenerator = CheckSum.getInstance(checkSumType);
                    }

                } else if (!status.getPath().getName().startsWith(".")) {

                    // Read other (.data , .index) files
                    File copyLocation = new File(dest, status.getPath().getName());
                    copyFileWithCheckSum(fs, status.getPath(), copyLocation, stats, fileCheckSumGenerator);

                    if (fileCheckSumGenerator != null && checkSumGenerator != null) {
                        byte[] checkSum = fileCheckSumGenerator.getCheckSum();
                        logger.debug("Checksum for " + status.getPath() + " - "
                                + new String(Hex.encodeHex(checkSum)));
                        checkSumGenerator.update(checkSum);
                    }
                }
            }

            logger.info("Completed reading all files from " + source.toString() + " to "
                    + dest.getAbsolutePath());

            // Check checksum
            if (checkSumType != CheckSumType.NONE) {
                byte[] newCheckSum = checkSumGenerator.getCheckSum();
                boolean checkSumComparison = (ByteUtils.compare(newCheckSum, origCheckSum) == 0);

                logger.info("Checksum generated from streaming - " + new String(Hex.encodeHex(newCheckSum)));
                logger.info("Checksum on file - " + new String(Hex.encodeHex(origCheckSum)));
                logger.info("Check-sum verification - " + checkSumComparison);

                return checkSumComparison;
            } else {
                logger.info("No check-sum verification required");
                return true;
            }
        }
    }
    logger.error("Source " + source.toString() + " should be a directory");
    return false;
}
From source file:voldemort.store.readonly.mr.utils.AvroUtils.java
License:Apache License
/**
 * Pull the schema off of the given file (if it is a file). If it is a
 * directory, then pull schemas off of all subfiles, and check that they are
 * all the same schema. If so, return that schema, otherwise throw an
 * exception.
 *
 * @param fs The filesystem to use
 * @param path The path from which to get the schema
 * @param checkSameSchema boolean flag to check all files in directory for
 *        same schema
 * @return The schema of this file or all its subfiles
 * @throws IOException
 */
@SuppressWarnings({ "unchecked", "rawtypes" })
private static Schema getSchemaFromPath(FileSystem fs, Path path, boolean checkSameSchema) {
    try {
        if (fs.isFile(path)) {
            BufferedInputStream inStream = null;
            try {
                inStream = new BufferedInputStream(fs.open(path));
            } catch (IOException e1) {
                // TODO Auto-generated catch block
                e1.printStackTrace();
            }
            GenericDatumReader datum = new GenericDatumReader();

            DataFileStream reader = null;
            try {
                reader = new DataFileStream(inStream, datum);
            } catch (IOException e) {
                // TODO Auto-generated catch block
                e.printStackTrace();
            }
            return reader.getSchema();
        } else {
            FileStatus[] statuses = null;
            if (fs.isDirectory(path)) {
                // this is a directory, get schemas from all subfiles
                statuses = fs.listStatus(path);
            } else {
                // this is a wildcard path, get schemas from all matched files
                statuses = fs.globStatus(path);
            }
            if (statuses == null || statuses.length == 0)
                throw new IllegalArgumentException("No files found in path pattern " + path.toUri().getPath());

            List<Schema> schemas = new ArrayList<Schema>();
            for (FileStatus status : statuses) {
                if (!HadoopUtils.shouldPathBeIgnored(status.getPath())) {
                    if (!checkSameSchema) {
                        // return first valid schema w/o checking all files
                        return getSchemaFromPath(fs, status.getPath(), checkSameSchema);
                    }
                    schemas.add(getSchemaFromPath(fs, status.getPath(), checkSameSchema));
                }
            }

            // now check that all the schemas are the same
            if (schemas.size() > 0) {
                Schema schema = schemas.get(0);
                for (int i = 1; i < schemas.size(); i++)
                    if (!schema.equals(schemas.get(i)))
                        throw new IllegalArgumentException("The directory " + path.toString()
                                + " contains heterogenous schemas: found both '" + schema.toString()
                                + "' and '" + schemas.get(i).toString() + "'.");

                return schema;
            } else {
                throw new IllegalArgumentException("No Valid metadata file found for Path:" + path.toString());
            }
        }
    } catch (Exception e) {
        // logger.error("failed to get metadata from path:" + path);
        throw new RuntimeException(e);
    }
}