List of usage examples for org.apache.hadoop.fs FileSystem getConf
@Override
public Configuration getConf()
From source file:com.hadoop.compression.lzo.LzoIndex.java
License:Open Source License
/** * Index an lzo file to allow the input format to split them into separate map * jobs./*from ww w. j ava 2 s . c o m*/ * * @param fs File system that contains the file. * @param lzoFile the lzo file to index. For filename.lzo, the created index file will be * filename.lzo.index. * @throws IOException */ public static void createIndex(FileSystem fs, Path lzoFile) throws IOException { Configuration conf = fs.getConf(); CompressionCodecFactory factory = new CompressionCodecFactory(conf); CompressionCodec codec = factory.getCodec(lzoFile); if (null == codec) { throw new IOException("Could not find codec for file " + lzoFile + " - you may need to add the LZO codec to your io.compression.codecs " + "configuration in core-site.xml"); } ((Configurable) codec).setConf(conf); FSDataInputStream is = null; FSDataOutputStream os = null; Path outputFile = lzoFile.suffix(LZO_INDEX_SUFFIX); Path tmpOutputFile = lzoFile.suffix(LZO_TMP_INDEX_SUFFIX); // Track whether an exception was thrown or not, so we know to either // delete the tmp index file on failure, or rename it to the new index file on success. boolean indexingSucceeded = false; try { is = fs.open(lzoFile); os = fs.create(tmpOutputFile); LzopDecompressor decompressor = (LzopDecompressor) codec.createDecompressor(); // Solely for reading the header codec.createInputStream(is, decompressor); int numCompressedChecksums = decompressor.getCompressedChecksumsCount(); int numDecompressedChecksums = decompressor.getDecompressedChecksumsCount(); while (true) { // read and ignore, we just want to get to the next int int uncompressedBlockSize = is.readInt(); if (uncompressedBlockSize == 0) { break; } else if (uncompressedBlockSize < 0) { throw new EOFException(); } int compressedBlockSize = is.readInt(); if (compressedBlockSize <= 0) { throw new IOException("Could not read compressed block size"); } // See LzopInputStream.getCompressedData boolean isUncompressedBlock = (uncompressedBlockSize == compressedBlockSize); int numChecksumsToSkip = isUncompressedBlock ? numDecompressedChecksums : numDecompressedChecksums + numCompressedChecksums; long pos = is.getPos(); // write the pos of the block start os.writeLong(pos - 8); // seek to the start of the next block, skip any checksums is.seek(pos + compressedBlockSize + (4 * numChecksumsToSkip)); } // If we're here, indexing was successful. indexingSucceeded = true; } finally { // Close any open streams. if (is != null) { is.close(); } if (os != null) { os.close(); } if (!indexingSucceeded) { // If indexing didn't succeed (i.e. an exception was thrown), clean up after ourselves. fs.delete(tmpOutputFile, false); } else { // Otherwise, rename filename.lzo.index.tmp to filename.lzo.index. fs.rename(tmpOutputFile, outputFile); } } }
From source file:com.hadoop.mapreduce.LzoTextInputFormat.java
License:Open Source License
/** * Index an lzo file to allow the input format to split them into separate map * jobs./*from w w w . j a va 2 s . c o m*/ * * @param fs * File system that contains the file. * @param lzoFile * the lzo file to index. * @throws IOException */ public static void createIndex(FileSystem fs, Path lzoFile) throws IOException { Configuration conf = fs.getConf(); CompressionCodecFactory factory = new CompressionCodecFactory(fs.getConf()); CompressionCodec codec = factory.getCodec(lzoFile); ((Configurable) codec).setConf(conf); InputStream lzoIs = null; FSDataOutputStream os = null; Path outputFile = new Path(lzoFile.toString() + LzoTextInputFormat.LZO_INDEX_SUFFIX); Path tmpOutputFile = outputFile.suffix(".tmp"); try { FSDataInputStream is = fs.open(lzoFile); os = fs.create(tmpOutputFile); LzopDecompressor decompressor = (LzopDecompressor) codec.createDecompressor(); // for reading the header lzoIs = codec.createInputStream(is, decompressor); int numChecksums = decompressor.getChecksumsCount(); while (true) { // read and ignore, we just want to get to the next int int uncompressedBlockSize = is.readInt(); if (uncompressedBlockSize == 0) { break; } else if (uncompressedBlockSize < 0) { throw new EOFException(); } int compressedBlockSize = is.readInt(); if (compressedBlockSize <= 0) { throw new IOException("Could not read compressed block size"); } long pos = is.getPos(); // write the pos of the block start os.writeLong(pos - 8); // seek to the start of the next block, skip any checksums is.seek(pos + compressedBlockSize + (4 * numChecksums)); } } finally { if (lzoIs != null) { lzoIs.close(); } if (os != null) { os.close(); } } fs.rename(tmpOutputFile, outputFile); }
From source file:com.hdfs.concat.crush.integration.CrushMapReduceTest.java
License:Apache License
/** * Copies data from the given input stream to an HDFS file at the given path. This method will close the input stream. *//* w ww.j a v a 2s. c o m*/ protected final void copyStreamToHdfs(InputStream resource, String hdfsDestFileName) throws IOException { FileSystem fs = getFileSystem(); FSDataOutputStream os = fs.create(new Path(hdfsDestFileName), false); IOUtils.copyBytes(resource, os, fs.getConf(), true); }
From source file:com.lithium.flow.filer.HdfsFiler.java
License:Apache License
public HdfsFiler(@Nonnull FileSystem fileSystem) { this.fileSystem = checkNotNull(fileSystem); overwrite = fileSystem.getConf().getBoolean("overwrite", true); }
From source file:com.mellanox.r4h.TestReadWhileWriting.java
License:Apache License
/** Test reading while writing. */ @Test/*from www . j ava 2s . c o m*/ public void pipeline_02_03() throws Exception { final Configuration conf = new HdfsConfiguration(); conf.setLong(DFSConfigKeys.DFS_HEARTBEAT_INTERVAL_KEY, 1); // create cluster final MiniDFSCluster cluster = new MiniDFSCluster.Builder(conf).numDataNodes(4).build(); try { //change the lease limits. cluster.setLeasePeriod(SOFT_LEASE_LIMIT, HARD_LEASE_LIMIT); //wait for the cluster cluster.waitActive(); final FileSystem fs = cluster.getFileSystem(); final Path p = new Path(DIR, "file1"); final int half = BLOCK_SIZE / 2; //a. On Machine M1, Create file. Write half block of data. // Invoke DFSOutputStream.hflush() on the dfs file handle. // Do not close file yet. { final FSDataOutputStream out = fs.create(p, true, fs.getConf().getInt(CommonConfigurationKeys.IO_FILE_BUFFER_SIZE_KEY, 4096), (short) 3, BLOCK_SIZE); write(out, 0, half); //hflush ((DFSOutputStream) out.getWrappedStream()).hflush(); } //b. On another machine M2, open file and verify that the half-block // of data can be read successfully. checkFile(p, half, conf); MiniDFSClusterBridge.getAppendTestUtilLOG().info("leasechecker.interruptAndJoin()"); ((DistributedFileSystem) fs).dfs.getLeaseRenewer().interruptAndJoin(); //c. On M1, append another half block of data. Close file on M1. { //sleep to let the lease is expired. Thread.sleep(2 * SOFT_LEASE_LIMIT); final UserGroupInformation current = UserGroupInformation.getCurrentUser(); final UserGroupInformation ugi = UserGroupInformation .createUserForTesting(current.getShortUserName() + "x", new String[] { "supergroup" }); final DistributedFileSystem dfs = ugi.doAs(new PrivilegedExceptionAction<DistributedFileSystem>() { @Override public DistributedFileSystem run() throws Exception { return (DistributedFileSystem) FileSystem.newInstance(conf); } }); final FSDataOutputStream out = append(dfs, p); write(out, 0, half); out.close(); } //d. On M2, open file and read 1 block of data from it. Close file. checkFile(p, 2 * half, conf); } finally { cluster.shutdown(); } }
From source file:com.netease.news.utils.SplitInput.java
License:Apache License
/** * Configure this instance based on the command-line arguments contained within provided array. * Calls {@link #validate()} to ensure consistency of configuration. * * @return true if the arguments were parsed successfully and execution should proceed. * @throws Exception if there is a problem parsing the command-line arguments or the particular * combination would violate class invariants. *///from w ww . j a v a2 s. co m private boolean parseArgs(String[] args) throws Exception { addInputOption(); addOption("trainingOutput", "tr", "The training data output directory", false); addOption("testOutput", "te", "The test data output directory", false); addOption("testSplitSize", "ss", "The number of documents held back as test data for each category", false); addOption("testSplitPct", "sp", "The % of documents held back as test data for each category", false); addOption("splitLocation", "sl", "Location for start of test data expressed as a percentage of the input file " + "size (0=start, 50=middle, 100=end", false); addOption("randomSelectionSize", "rs", "The number of items to be randomly selected as test data ", false); addOption("randomSelectionPct", "rp", "Percentage of items to be randomly selected as test data when using " + "mapreduce mode", false); addOption("charset", "c", "The name of the character encoding of the input files (not needed if using " + "SequenceFiles)", false); addOption(buildOption("sequenceFiles", "seq", "Set if the input files are sequence files. Default is false", false, false, "false")); addOption(DefaultOptionCreator.methodOption().create()); addOption(DefaultOptionCreator.overwriteOption().create()); //TODO: extend this to sequential mode addOption("keepPct", "k", "The percentage of total data to keep in map-reduce mode, the rest will be ignored. " + "Default is 100%", false); addOption("mapRedOutputDir", "mro", "Output directory for map reduce jobs", false); if (parseArguments(args) == null) { return false; } try { inputDirectory = getInputPath(); useMapRed = getOption(DefaultOptionCreator.METHOD_OPTION) .equalsIgnoreCase(DefaultOptionCreator.MAPREDUCE_METHOD); if (useMapRed) { if (!hasOption("randomSelectionPct")) { throw new OptionException(getCLIOption("randomSelectionPct"), "must set randomSelectionPct when mapRed option is used"); } if (!hasOption("mapRedOutputDir")) { throw new OptionException(getCLIOption("mapRedOutputDir"), "mapRedOutputDir must be set when mapRed option is used"); } mapRedOutputDirectory = new Path(getOption("mapRedOutputDir")); if (hasOption("keepPct")) { keepPct = Integer.parseInt(getOption("keepPct")); } if (hasOption(DefaultOptionCreator.OVERWRITE_OPTION)) { HadoopUtil.delete(getConf(), mapRedOutputDirectory); } } else { if (!hasOption("trainingOutput") || !hasOption("testOutput")) { throw new OptionException(getCLIOption("trainingOutput"), "trainingOutput and testOutput must be set if mapRed option is not used"); } if (!hasOption("testSplitSize") && !hasOption("testSplitPct") && !hasOption("randomSelectionPct") && !hasOption("randomSelectionSize")) { throw new OptionException(getCLIOption("testSplitSize"), "must set one of test split size/percentage or randomSelectionSize/percentage"); } trainingOutputDirectory = new Path(getOption("trainingOutput")); testOutputDirectory = new Path(getOption("testOutput")); FileSystem fs = trainingOutputDirectory.getFileSystem(getConf()); if (hasOption(DefaultOptionCreator.OVERWRITE_OPTION)) { HadoopUtil.delete(fs.getConf(), trainingOutputDirectory); HadoopUtil.delete(fs.getConf(), testOutputDirectory); } fs.mkdirs(trainingOutputDirectory); fs.mkdirs(testOutputDirectory); } if (hasOption("charset")) { charset = Charset.forName(getOption("charset")); } if (hasOption("testSplitSize") && hasOption("testSplitPct")) { throw new OptionException(getCLIOption("testSplitPct"), "must have either split size or split percentage " + "option, not BOTH"); } if (hasOption("testSplitSize")) { setTestSplitSize(Integer.parseInt(getOption("testSplitSize"))); } if (hasOption("testSplitPct")) { setTestSplitPct(Integer.parseInt(getOption("testSplitPct"))); } if (hasOption("splitLocation")) { setSplitLocation(Integer.parseInt(getOption("splitLocation"))); } if (hasOption("randomSelectionSize")) { setTestRandomSelectionSize(Integer.parseInt(getOption("randomSelectionSize"))); } if (hasOption("randomSelectionPct")) { setTestRandomSelectionPct(Integer.parseInt(getOption("randomSelectionPct"))); } useSequence = hasOption("sequenceFiles"); } catch (OptionException e) { log.error("Command-line option Exception", e); CommandLineUtil.printHelp(getGroup()); return false; } validate(); return true; }
From source file:com.netease.news.utils.SplitInput.java
License:Apache License
/** * Perform a split on the specified input file. Results will be written to files of the same name in the specified * training and test output directories. The {@link #validate()} method is called prior to executing the split. */// w w w . j a v a 2s .c o m public void splitFile(Path inputFile) throws IOException { Configuration conf = getConf(); FileSystem fs = inputFile.getFileSystem(conf); if (fs.getFileStatus(inputFile) == null) { throw new IOException(inputFile + " does not exist"); } if (fs.getFileStatus(inputFile).isDir()) { throw new IOException(inputFile + " is a directory"); } validate(); Path testOutputFile = new Path(testOutputDirectory, inputFile.getName()); Path trainingOutputFile = new Path(trainingOutputDirectory, inputFile.getName()); int lineCount = countLines(fs, inputFile, charset); log.info("{} has {} lines", inputFile.getName(), lineCount); int testSplitStart = 0; int testSplitSize = this.testSplitSize; // don't modify state BitSet randomSel = null; if (testRandomSelectionPct > 0 || testRandomSelectionSize > 0) { testSplitSize = this.testRandomSelectionSize; if (testRandomSelectionPct > 0) { testSplitSize = Math.round(lineCount * testRandomSelectionPct / 100.0f); } log.info("{} test split size is {} based on random selection percentage {}", inputFile.getName(), testSplitSize, testRandomSelectionPct); long[] ridx = new long[testSplitSize]; RandomSampler.sample(testSplitSize, lineCount - 1, testSplitSize, 0, ridx, 0, RandomUtils.getRandom()); randomSel = new BitSet(lineCount); for (long idx : ridx) { randomSel.set((int) idx + 1); } } else { if (testSplitPct > 0) { // calculate split size based on percentage testSplitSize = Math.round(lineCount * testSplitPct / 100.0f); log.info("{} test split size is {} based on percentage {}", inputFile.getName(), testSplitSize, testSplitPct); } else { log.info("{} test split size is {}", inputFile.getName(), testSplitSize); } if (splitLocation > 0) { // calculate start of split based on percentage testSplitStart = Math.round(lineCount * splitLocation / 100.0f); if (lineCount - testSplitStart < testSplitSize) { // adjust split start downwards based on split size. testSplitStart = lineCount - testSplitSize; } log.info("{} test split start is {} based on split location {}", inputFile.getName(), testSplitStart, splitLocation); } if (testSplitStart < 0) { throw new IllegalArgumentException( "test split size for " + inputFile + " is too large, it would produce an " + "empty training set from the initial set of " + lineCount + " examples"); } else if (lineCount - testSplitSize < testSplitSize) { log.warn( "Test set size for {} may be too large, {} is larger than the number of " + "lines remaining in the training set: {}", inputFile, testSplitSize, lineCount - testSplitSize); } } int trainCount = 0; int testCount = 0; if (!useSequence) { BufferedReader reader = new BufferedReader(new InputStreamReader(fs.open(inputFile), charset)); Writer trainingWriter = new OutputStreamWriter(fs.create(trainingOutputFile), charset); Writer testWriter = new OutputStreamWriter(fs.create(testOutputFile), charset); try { String line; int pos = 0; while ((line = reader.readLine()) != null) { pos++; Writer writer; if (testRandomSelectionPct > 0) { // Randomly choose writer = randomSel.get(pos) ? testWriter : trainingWriter; } else { // Choose based on location writer = pos > testSplitStart ? testWriter : trainingWriter; } if (writer == testWriter) { if (testCount >= testSplitSize) { writer = trainingWriter; } else { testCount++; } } if (writer == trainingWriter) { trainCount++; } writer.write(line); writer.write('\n'); } } finally { Closeables.close(reader, true); Closeables.close(trainingWriter, false); Closeables.close(testWriter, false); } } else { SequenceFileIterator<Writable, Writable> iterator = new SequenceFileIterator<Writable, Writable>( inputFile, false, fs.getConf()); SequenceFile.Writer trainingWriter = SequenceFile.createWriter(fs, fs.getConf(), trainingOutputFile, iterator.getKeyClass(), iterator.getValueClass()); SequenceFile.Writer testWriter = SequenceFile.createWriter(fs, fs.getConf(), testOutputFile, iterator.getKeyClass(), iterator.getValueClass()); try { int pos = 0; while (iterator.hasNext()) { pos++; SequenceFile.Writer writer; if (testRandomSelectionPct > 0) { // Randomly choose writer = randomSel.get(pos) ? testWriter : trainingWriter; } else { // Choose based on location writer = pos > testSplitStart ? testWriter : trainingWriter; } if (writer == testWriter) { if (testCount >= testSplitSize) { writer = trainingWriter; } else { testCount++; } } if (writer == trainingWriter) { trainCount++; } Pair<Writable, Writable> pair = iterator.next(); writer.append(pair.getFirst(), pair.getSecond()); } } finally { Closeables.close(iterator, true); Closeables.close(trainingWriter, false); Closeables.close(testWriter, false); } } log.info("file: {}, input: {} train: {}, test: {} starting at {}", inputFile.getName(), lineCount, trainCount, testCount, testSplitStart); // testing; if (callback != null) { callback.splitComplete(inputFile, lineCount, trainCount, testCount, testSplitStart); } }
From source file:com.ning.metrics.action.hdfs.data.RowFileContentsIteratorFactory.java
License:Apache License
public Iterator<Row> build(final FileSystem fs, final Path path, final boolean raw) throws IOException { try {//from w w w .j a va 2 s. c o m return new RowSequenceFileContentsIterator(path.toUri().getPath(), rowParser, registrar, new SequenceFile.Reader(fs, path, fs.getConf()), raw); } catch (IOException e) { // Not a Sequence file? final FSDataInputStream input = fs.open(path); return new RowTextFileContentsIterator(path.toUri().getPath(), rowParser, registrar, input, raw); } }
From source file:com.ning.metrics.collector.healthchecks.HadoopHealthCheck.java
License:Apache License
@Override public Result check() { try {/*from w ww . j a v a 2 s . c o m*/ final FileSystem fileSystem = fsAccess.get(0); // No exponential backoff, fail early final Configuration fileSystemConf = fileSystem.getConf(); final StringBuilder builder = new StringBuilder(); for (final String prop : HADOOP_PROPERTIES) { builder.append(String.format("%s: %s, ", prop, fileSystemConf.get(prop))); } return Result.healthy(builder.toString()); } catch (Exception e) { return Result.healthy("Exception when trying to access Hadoop"); } }
From source file:com.ricemap.spateDB.operations.Repartition.java
License:Apache License
public static <S extends Shape> CellInfo[] packInPrisms(FileSystem fs, Path[] files, FileSystem outFileSystem, Path outFile, long blocksize, S stockShape) throws IOException { final Vector<Point3d> sample = new Vector<Point3d>(); double sample_ratio = outFileSystem.getConf().getFloat(SpatialSite.SAMPLE_RATIO, 0.01f); long sample_size = outFileSystem.getConf().getLong(SpatialSite.SAMPLE_SIZE, 100 * 1024 * 1024); // 24 is the estimated size in bytes needed to store each sample point long sample_count = sample_size / 24; LOG.info("Reading a sample of " + (int) Math.round(sample_ratio * 100) + "%"); ResultCollector<Point3d> resultCollector = new ResultCollector<Point3d>() { @Override/*from w ww . ja v a 2s.c o m*/ public void collect(Point3d value) { sample.add(value.clone()); } }; Sampler.sampleWithRatio(fs, files, sample_ratio, sample_count, System.currentTimeMillis(), resultCollector, stockShape, new Point3d()); LOG.info("Finished reading a sample of size: " + sample.size() + " records"); long inFileSize = Sampler.sizeOfLastProcessedFile; // Compute an approximate MBR to determine the desired number of rows // and columns Prism approxMBR = new Prism(Double.MAX_VALUE, Double.MAX_VALUE, Double.MAX_VALUE, -Double.MAX_VALUE, -Double.MAX_VALUE, -Double.MAX_VALUE); for (Point3d pt : sample) { approxMBR.expand(pt); } GridInfo gridInfo = new GridInfo(approxMBR.t1, approxMBR.x1, approxMBR.y1, approxMBR.t2, approxMBR.x2, approxMBR.y2); gridInfo.calculateCellDimensions(Math.max(1, (int) ((inFileSize + blocksize / 2) / blocksize))); gridInfo.set(-Double.MAX_VALUE, -Double.MAX_VALUE, -Double.MAX_VALUE, Double.MAX_VALUE, Double.MAX_VALUE, Double.MAX_VALUE); Prism[] Prisms = RTree.packInPrisms(gridInfo, sample.toArray(new Point3d[sample.size()])); CellInfo[] cellsInfo = new CellInfo[Prisms.length]; for (int i = 0; i < Prisms.length; i++) cellsInfo[i] = new CellInfo(i + 1, Prisms[i]); return cellsInfo; }