List of usage examples for org.apache.hadoop.fs FileSystem open
public FSDataInputStream open(Path f) throws IOException
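Before the per-project examples below, here is a minimal, self-contained sketch of the typical call pattern: obtain a FileSystem for a URI, call open() to get a seekable FSDataInputStream, and stream the bytes out. The URI used here is a placeholder, not taken from any of the source files.

import java.io.IOException;
import java.net.URI;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IOUtils;

public class OpenExample {
    public static void main(String[] args) throws IOException {
        String uri = "hdfs:///tmp/example.txt"; // placeholder path, substitute your own
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(URI.create(uri), conf);
        FSDataInputStream in = null;
        try {
            // open() returns a seekable, positioned-read-capable stream
            in = fs.open(new Path(uri));
            IOUtils.copyBytes(in, System.out, 4096, false);
        } finally {
            IOUtils.closeStream(in);
        }
    }
}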
From source file:cn.lhfei.hadoop.ch04.FileDecompressor.java
License:Apache License
/**
 * Use case: % hadoop FileDecompressor file.gz
 * @param args
 */
public static void main(String[] args) {
    FileSystem fs = null;
    String uri = args[0];
    Path inputPath = null;
    Configuration conf = new Configuration();
    CompressionCodecFactory factory = null;
    InputStream in = null;
    OutputStream out = null;
    try {
        fs = FileSystem.get(URI.create(uri), conf);
        inputPath = new Path(uri);
        factory = new CompressionCodecFactory(conf);
        CompressionCodec codec = factory.getCodec(inputPath);
        if (codec == null) {
            System.err.println("No codec found for " + uri);
            System.exit(1);
        }
        String outputUri = CompressionCodecFactory.removeSuffix(uri, codec.getDefaultExtension());
        in = codec.createInputStream(fs.open(inputPath));
        out = fs.create(new Path(outputUri));
        IOUtils.copyBytes(in, out, conf);
    } catch (IOException e) {
        e.printStackTrace();
    } finally {
        IOUtils.closeStream(in);
        IOUtils.closeStream(out);
    }
}
From source file:cn.uc.hadoop.mapreduce.lib.input.FileNameLineRecordReader.java
License:Apache License
public void initialize(InputSplit genericSplit, TaskAttemptContext context) throws IOException {
    FileSplit split = (FileSplit) genericSplit;
    Configuration job = context.getConfiguration();
    this.maxLineLength = job.getInt(MAX_LINE_LENGTH, Integer.MAX_VALUE);
    start = split.getStart();
    end = start + split.getLength();
    final Path file = split.getPath();
    // ADD by qiujw: the key is the file name
    key = new Text(file.getName());
    compressionCodecs = new CompressionCodecFactory(job);
    codec = compressionCodecs.getCodec(file);

    // open the file and seek to the start of the split
    final FileSystem fs = file.getFileSystem(job);
    fileIn = fs.open(file);
    if (isCompressedInput()) {
        decompressor = CodecPool.getDecompressor(codec);
        if (codec instanceof SplittableCompressionCodec) {
            final SplitCompressionInputStream cIn = ((SplittableCompressionCodec) codec).createInputStream(
                    fileIn, decompressor, start, end, SplittableCompressionCodec.READ_MODE.BYBLOCK);
            if (null == this.recordDelimiterBytes) {
                in = new LineReader(cIn, job);
            } else {
                in = new LineReader(cIn, job, this.recordDelimiterBytes);
            }
            start = cIn.getAdjustedStart();
            end = cIn.getAdjustedEnd();
            filePosition = cIn;
        } else {
            if (null == this.recordDelimiterBytes) {
                in = new LineReader(codec.createInputStream(fileIn, decompressor), job);
            } else {
                in = new LineReader(codec.createInputStream(fileIn, decompressor), job,
                        this.recordDelimiterBytes);
            }
            filePosition = fileIn;
        }
    } else {
        fileIn.seek(start);
        if (null == this.recordDelimiterBytes) {
            in = new LineReader(fileIn, job);
        } else {
            in = new LineReader(fileIn, job, this.recordDelimiterBytes);
        }
        filePosition = fileIn;
    }
    // If this is not the first split, we always throw away the first record
    // because we always (except for the last split) read one extra line in
    // the next() method.
    if (start != 0) {
        start += in.readLine(new Text(), 0, maxBytesToConsume(start));
    }
    this.pos = start;
}
From source file:cn.uc.hadoop.mapreduce.lib.input.FilePathLineRecordReader.java
License:Apache License
public void initialize(InputSplit genericSplit, TaskAttemptContext context) throws IOException {
    FileSplit split = (FileSplit) genericSplit;
    Configuration job = context.getConfiguration();
    this.maxLineLength = job.getInt(MAX_LINE_LENGTH, Integer.MAX_VALUE);
    start = split.getStart();
    end = start + split.getLength();
    final Path file = split.getPath();
    // ADD by qiujw: the key is the full file path
    key = new Text(file.toString());
    compressionCodecs = new CompressionCodecFactory(job);
    codec = compressionCodecs.getCodec(file);

    // open the file and seek to the start of the split
    final FileSystem fs = file.getFileSystem(job);
    fileIn = fs.open(file);
    if (isCompressedInput()) {
        decompressor = CodecPool.getDecompressor(codec);
        if (codec instanceof SplittableCompressionCodec) {
            final SplitCompressionInputStream cIn = ((SplittableCompressionCodec) codec).createInputStream(
                    fileIn, decompressor, start, end, SplittableCompressionCodec.READ_MODE.BYBLOCK);
            if (null == this.recordDelimiterBytes) {
                in = new LineReader(cIn, job);
            } else {
                in = new LineReader(cIn, job, this.recordDelimiterBytes);
            }
            start = cIn.getAdjustedStart();
            end = cIn.getAdjustedEnd();
            filePosition = cIn;
        } else {
            if (null == this.recordDelimiterBytes) {
                in = new LineReader(codec.createInputStream(fileIn, decompressor), job);
            } else {
                in = new LineReader(codec.createInputStream(fileIn, decompressor), job,
                        this.recordDelimiterBytes);
            }
            filePosition = fileIn;
        }
    } else {
        fileIn.seek(start);
        if (null == this.recordDelimiterBytes) {
            in = new LineReader(fileIn, job);
        } else {
            in = new LineReader(fileIn, job, this.recordDelimiterBytes);
        }
        filePosition = fileIn;
    }
    // If this is not the first split, we always throw away the first record
    // because we always (except for the last split) read one extra line in
    // the next() method.
    if (start != 0) {
        start += in.readLine(new Text(), 0, maxBytesToConsume(start));
    }
    this.pos = start;
}
From source file:co.cask.cdap.common.io.Locations.java
License:Apache License
/**
 * Creates a new {@link InputSupplier} that provides a {@link SeekableInputStream} for the given path.
 *
 * @param fs The {@link org.apache.hadoop.fs.FileSystem} for the given path.
 * @param path The path to create a {@link co.cask.cdap.common.io.SeekableInputStream} for when requested.
 * @return An {@link InputSupplier}.
 */
public static InputSupplier<? extends SeekableInputStream> newInputSupplier(final FileSystem fs,
        final Path path) {
    return new InputSupplier<SeekableInputStream>() {
        @Override
        public SeekableInputStream getInput() throws IOException {
            FSDataInputStream input = fs.open(path);
            try {
                return new DFSSeekableInputStream(input, createDFSStreamSizeProvider(fs, path, input));
            } catch (Throwable t) {
                Closeables.closeQuietly(input);
                Throwables.propagateIfInstanceOf(t, IOException.class);
                throw new IOException(t);
            }
        }
    };
}
From source file:co.cask.cdap.common.logging.SyncTest.java
License:Apache License
@Test
@Ignore
public void testSync() throws IOException {
    FileSystem fs = FileSystem.get(config);
    // create a file, write n bytes, then sync
    Path path = new Path("/myfile");
    FSDataOutputStream out = fs.create(path, false, 4096, (short) 2, 4096L);
    int numBytes = 5000;
    for (int i = 0; i < numBytes; i++) {
        out.write((byte) i);
    }
    out.hflush();
    // verify the file is there
    Assert.assertTrue(fs.exists(path));
    // do not verify the length of the file, hflush() does not update that
    //Assert.assertEquals(numBytes, fs.getFileStatus(path).getLen());
    // read back and verify all bytes
    FSDataInputStream in = fs.open(path);
    byte[] buffer = new byte[numBytes];
    in.readFully(buffer);
    for (int i = 0; i < numBytes; i++) {
        Assert.assertEquals((byte) i, buffer[i]);
    }
    in.close();
    // now close the writer
    out.close();
}
From source file:co.cask.hydrator.plugin.batch.action.FileAction.java
License:Apache License
@SuppressWarnings("ConstantConditions") @Override/* w ww .ja v a2s .co m*/ public void run(BatchActionContext context) throws Exception { if (!config.shouldRun(context)) { return; } config.substituteMacros(context); Job job = JobUtils.createInstance(); Configuration conf = job.getConfiguration(); FileSystem fileSystem = FileSystem.get(conf); Path[] paths; Path sourcePath = new Path(config.path); if (fileSystem.isDirectory(sourcePath)) { FileStatus[] status = fileSystem.listStatus(sourcePath); paths = FileUtil.stat2Paths(status); } else { paths = new Path[] { sourcePath }; } //get regex pattern for file name filtering. boolean patternSpecified = !Strings.isNullOrEmpty(config.pattern); if (patternSpecified) { regex = Pattern.compile(config.pattern); } switch (config.action.toLowerCase()) { case "delete": for (Path path : paths) { if (!patternSpecified || isFileNameMatch(path.getName())) { fileSystem.delete(path, true); } } break; case "move": for (Path path : paths) { if (!patternSpecified || isFileNameMatch(path.getName())) { Path targetFileMovePath = new Path(config.targetFolder, path.getName()); fileSystem.rename(path, targetFileMovePath); } } break; case "archive": for (Path path : paths) { if (!patternSpecified || isFileNameMatch(path.getName())) { try (FSDataOutputStream archivedStream = fileSystem .create(new Path(config.targetFolder, path.getName() + ".zip")); ZipOutputStream zipArchivedStream = new ZipOutputStream(archivedStream); FSDataInputStream fdDataInputStream = fileSystem.open(path)) { zipArchivedStream.putNextEntry(new ZipEntry(path.getName())); int length; byte[] buffer = new byte[1024]; while ((length = fdDataInputStream.read(buffer)) > 0) { zipArchivedStream.write(buffer, 0, length); } zipArchivedStream.closeEntry(); } fileSystem.delete(path, true); } } break; default: LOG.warn("No action required on the file."); break; } }
From source file:co.cask.hydrator.plugin.batch.CopybookRecordReader.java
License:Apache License
@Override
public void initialize(InputSplit split, TaskAttemptContext context) throws IOException, InterruptedException {
    // Get configuration
    Configuration conf = context.getConfiguration();
    int fileStructure = net.sf.JRecord.Common.Constants.IO_FIXED_LENGTH;
    Path path = new Path(conf.get(CopybookInputFormat.COPYBOOK_INPUTFORMAT_DATA_HDFS_PATH));
    FileSystem fs = FileSystem.get(path.toUri(), conf);
    // Create an input stream for the COBOL copybook contents
    InputStream inputStream = IOUtils
            .toInputStream(conf.get(CopybookInputFormat.COPYBOOK_INPUTFORMAT_CBL_CONTENTS), "UTF-8");
    BufferedInputStream bufferedInputStream = new BufferedInputStream(inputStream);
    try {
        externalRecord = CopybookIOUtils.getExternalRecord(bufferedInputStream);
        recordByteLength = CopybookIOUtils.getRecordLength(externalRecord, fileStructure);
        LineProvider lineProvider = LineIOProvider.getInstance().getLineProvider(fileStructure,
                CopybookIOUtils.FONT);
        reader = LineIOProvider.getInstance().getLineReader(fileStructure, lineProvider);
        LayoutDetail copybook = CopybookIOUtils.getLayoutDetail(externalRecord);
        org.apache.hadoop.mapreduce.lib.input.FileSplit fileSplit =
                (org.apache.hadoop.mapreduce.lib.input.FileSplit) split;
        start = fileSplit.getStart();
        end = start + fileSplit.getLength();
        BufferedInputStream fileIn = new BufferedInputStream(fs.open(fileSplit.getPath()));
        // Jump to the point in the split at which the first complete record starts,
        // if this is not the first InputSplit
        if (start != 0) {
            position = start - (start % recordByteLength) + recordByteLength;
            fileIn.skip(position);
        }
        reader.open(fileIn, copybook);
    } catch (Exception e) {
        throw new RuntimeException(e);
    }
}
From source file:co.cask.hydrator.plugin.db.batch.action.VerticaBulkImportAction.java
License:Apache License
@Override
public void run(ActionContext context) throws Exception {
    Object driver = Class.forName("com.vertica.jdbc.Driver").newInstance();
    DriverManager.registerDriver((Driver) driver);
    Preconditions.checkArgument(tableExists(config.tableName),
            "Table %s does not exist. Please check that the 'tableName' property "
                    + "has been set correctly, and that the connection string %s points to a valid database.",
            config.tableName, config.connectionString);
    String copyStatement;
    if (config.level.equalsIgnoreCase("basic")) {
        // COPY tableName FROM STDIN DELIMITER 'delimiter'
        copyStatement = String.format("COPY %s FROM STDIN DELIMITER '%s'", config.tableName, config.delimiter);
    } else {
        copyStatement = config.copyStatement;
    }
    LOG.debug("Copy statement is: {}", copyStatement);
    try {
        try (Connection connection = DriverManager.getConnection(config.connectionString, config.user,
                config.password)) {
            connection.setAutoCommit(false);
            // run the COPY statement
            VerticaCopyStream stream = new VerticaCopyStream((VerticaConnection) connection, copyStatement);
            // Keep a running count of the number of rejects
            int totalRejects = 0;
            // start() starts the stream process and opens the COPY command.
            stream.start();
            FileSystem fs = FileSystem.get(new Configuration());
            List<String> fileList = new ArrayList<>();
            FileStatus[] fileStatus;
            try {
                fileStatus = fs.listStatus(new Path(config.path));
                for (FileStatus fileStat : fileStatus) {
                    fileList.add(fileStat.getPath().toString());
                }
            } catch (FileNotFoundException e) {
                throw new IllegalArgumentException(String.format(
                        "Path %s not found on file system. Please provide correct path.", config.path), e);
            }
            if (fileStatus.length <= 0) {
                LOG.warn("No files available to load into vertica database");
            }
            for (String file : fileList) {
                Path path = new Path(file);
                FSDataInputStream inputStream = fs.open(path);
                // Add the stream to the VerticaCopyStream
                stream.addStream(inputStream);
                // Call execute() to load the newly added stream. You could add many
                // streams and call execute once to load them all. Which method you
                // choose depends mainly on whether you want the ability to check the
                // number of rejections as the load progresses so you can stop if the
                // number of rejects gets too high. Also, high numbers of InputStreams
                // could create a resource issue on your client system.
                stream.execute();
                // Show any rejects from this execution of the stream load.
                // getRejects() returns a List containing the row numbers of rejected rows.
                List<Long> rejects = stream.getRejects();
                // The size of the list gives you the number of rejected rows.
                int numRejects = rejects.size();
                totalRejects += numRejects;
                if (config.autoCommit.equalsIgnoreCase("true")) {
                    // Commit the loaded data
                    connection.commit();
                }
            }
            // finish() closes the COPY command. It returns the number of rows inserted.
            long results = stream.finish();
            context.getMetrics().gauge("num.of.rows.rejected", totalRejects);
            context.getMetrics().gauge("num.of.rows.inserted", results);
            // Commit the loaded data
            connection.commit();
        }
    } catch (Exception e) {
        throw new RuntimeException(String.format("Exception while running copy statement %s", copyStatement), e);
    } finally {
        DriverManager.deregisterDriver((Driver) driver);
    }
}
From source file:co.cask.tigon.io.Locations.java
License:Apache License
/**
 * Creates a new {@link com.google.common.io.InputSupplier} that provides a {@link SeekableInputStream} of
 * the given path.
 *
 * @param fs The {@link org.apache.hadoop.fs.FileSystem} for the given path.
 * @param path The path to create a {@link co.cask.tigon.io.SeekableInputStream} for when requested.
 * @return An {@link com.google.common.io.InputSupplier}.
 */
public static InputSupplier<? extends SeekableInputStream> newInputSupplier(final FileSystem fs,
        final Path path) {
    return new InputSupplier<SeekableInputStream>() {
        @Override
        public SeekableInputStream getInput() throws IOException {
            return SeekableInputStream.create(fs.open(path));
        }
    };
}
From source file:co.nubetech.hiho.dedup.DelimitedLineRecordReader.java
License:Apache License
/**
 * @param delimiter
 * @param column
 */
@Override
public void initialize(InputSplit genericSplit, TaskAttemptContext context) throws IOException {
    FileSplit split = (FileSplit) genericSplit;
    Configuration job = context.getConfiguration();
    this.delimiter = job.get(DelimitedTextInputFormat.DELIMITER_CONF);
    this.column = job.getInt(DelimitedTextInputFormat.COLUMN_CONF, 0);
    this.maxLineLength = job.getInt("mapred.linerecordreader.maxlength", Integer.MAX_VALUE);
    start = split.getStart();
    end = start + split.getLength();
    final Path file = split.getPath();
    compressionCodecs = new CompressionCodecFactory(job);
    final CompressionCodec codec = compressionCodecs.getCodec(file);

    // open the file and seek to the start of the split
    FileSystem fs = file.getFileSystem(job);
    FSDataInputStream fileIn = fs.open(split.getPath());
    boolean skipFirstLine = false;
    if (codec != null) {
        in = new LineReader(codec.createInputStream(fileIn), job);
        end = Long.MAX_VALUE;
    } else {
        if (start != 0) {
            skipFirstLine = true;
            --start;
            fileIn.seek(start);
        }
        in = new LineReader(fileIn, job);
    }
    if (skipFirstLine) {
        // skip the first line and re-establish "start"
        start += in.readLine(new Text(), 0, (int) Math.min((long) Integer.MAX_VALUE, end - start));
    }
    this.pos = start;
}