List of usage examples for org.apache.hadoop.fs Path getFileSystem
public FileSystem getFileSystem(Configuration conf) throws IOException
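Before the project examples below, here is a minimal, self-contained sketch of the typical pattern: resolve the FileSystem that owns a Path from a Configuration, then perform I/O through it. The path used here is only an illustrative placeholder, not taken from any of the listed sources.

import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class GetFileSystemExample {
    public static void main(String[] args) throws IOException {
        Configuration conf = new Configuration();
        // Illustrative placeholder path; getFileSystem() picks the FileSystem
        // (local, HDFS, ...) from the path's scheme and the configuration.
        Path path = new Path("/tmp/example/data.txt");
        FileSystem fs = path.getFileSystem(conf);
        if (fs.exists(path)) {
            FileStatus status = fs.getFileStatus(path);
            System.out.println(path + " has length " + status.getLen());
        }
    }
}

The examples that follow show the same call inside Mahout drivers, MapReduce record readers, ORC input/output formats, and Hive I/O utilities.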
From source file:com.elex.dmp.lda.CVB0Driver.java
License:Apache License
private static int getNumTerms(Configuration conf, Path dictionaryPath) throws IOException {
    FileSystem fs = dictionaryPath.getFileSystem(conf);
    Text key = new Text();
    IntWritable value = new IntWritable();
    int maxTermId = -1;
    for (FileStatus stat : fs.globStatus(dictionaryPath)) {
        SequenceFile.Reader reader = new SequenceFile.Reader(fs, stat.getPath(), conf);
        while (reader.next(key, value)) {
            maxTermId = Math.max(maxTermId, value.get());
        }
    }
    return maxTermId + 1;
}
From source file:com.elex.dmp.lda.InMemoryCollapsedVariationalBayes0.java
License:Apache License
private static Matrix loadVectors(String vectorPathString, Configuration conf) throws IOException {
    Path vectorPath = new Path(vectorPathString);
    FileSystem fs = vectorPath.getFileSystem(conf);
    List<Path> subPaths = Lists.newArrayList();
    if (fs.isFile(vectorPath)) {
        subPaths.add(vectorPath);
    } else {
        for (FileStatus fileStatus : fs.listStatus(vectorPath, PathFilters.logsCRCFilter())) {
            subPaths.add(fileStatus.getPath());
        }
    }
    List<Vector> vectorList = Lists.newArrayList();
    for (Path subPath : subPaths) {
        for (Pair<IntWritable, VectorWritable> record : new SequenceFileIterable<IntWritable, VectorWritable>(
                subPath, true, conf)) {
            vectorList.add(record.getSecond().get());
        }
    }
    int numRows = vectorList.size();
    int numCols = vectorList.get(0).size();
    return new SparseRowMatrix(numRows, numCols, vectorList.toArray(new Vector[vectorList.size()]), true,
            vectorList.get(0).isSequentialAccess());
}
From source file:com.ery.hadoop.mrddx.file.LineRecordReader.java
License:Apache License
void openFile() throws IOException {
    start = split.getStart();
    end = start + split.getLength();
    final Path file = split.getPath();
    LOG.info("split.getFileIndex=" + split.getFileIndex() + ",file.path=" + file.toString()
            + " fileEncodeing=" + fileEncodeing + " " + split.getStart() + ":" + split.getLength());
    // open the file and seek to the start of the split
    FileSystem fs = file.getFileSystem(job);
    FSDataInputStream fileIn = fs.open(split.getPath());
    compressionCodecs = new CompressionCodecFactory(job);
    codec = compressionCodecs.getCodec(file);
    if (file.getName().endsWith(".zip")) {
        LOG.info("use ZipInputStream read file " + split.getPath());
        ZipInputStream zin = new ZipInputStream(fileIn, Charset.forName(fileEncodeing));
        in = new LineReader(zin, job);
        filePosition = fileIn;
        codec = new GzipCodec();
        return;
    }
    if (isCompressedInput()) {
        decompressor = CodecPool.getDecompressor(codec);
        if (codec instanceof SplittableCompressionCodec) {
            final SplitCompressionInputStream cIn = ((SplittableCompressionCodec) codec).createInputStream(
                    fileIn, decompressor, start, end, SplittableCompressionCodec.READ_MODE.BYBLOCK);
            // .tar.gz files are additionally wrapped in a TarInputStream
            String filename = file.getName();
            if (filename.endsWith(".tar.gz")) {
                in = new LineReader(new TarInputStream(cIn), job);
            } else {
                in = new LineReader(cIn, job);
            }
            start = cIn.getAdjustedStart();
            end = cIn.getAdjustedEnd();
            filePosition = cIn; // take pos from compressed stream
        } else {
            String filename = file.getName();
            if (filename.endsWith(".tar.gz") || filename.endsWith(".tar")) {
                in = new LineReader(new TarInputStream(codec.createInputStream(fileIn, decompressor)), job);
            } else {
                in = new LineReader(codec.createInputStream(fileIn, decompressor), job);
            }
            filePosition = fileIn;
        }
    } else {
        fileIn.seek(start);
        String filename = file.getName();
        if (filename.endsWith(".tar")) {
            in = new LineReader(new TarInputStream(fileIn), job);
        } else {
            in = new LineReader(fileIn, job);
        }
        filePosition = fileIn;
    }
    // If this is not the first split, we always throw away the first record
    // because we always (except for the last split) read one extra line in
    // the next() method.
    if (start != 0) {
        start += in.readLine(new Text(), 0, maxBytesToConsume(start));
    }
    this.pos = start;
}
From source file:com.ery.hadoop.mrddx.hFile.LineRecordReader.java
License:Apache License
public void initialize(InputSplit genericSplit, TaskAttemptContext context) throws IOException {
    FileSplit split = (FileSplit) genericSplit;
    Configuration job = context.getConfiguration();
    this.maxLineLength = job.getInt("mapred.linerecordreader.maxlength", Integer.MAX_VALUE);
    start = split.getStart();
    end = start + split.getLength();
    final Path file = split.getPath();
    compressionCodecs = new CompressionCodecFactory(job);
    codec = compressionCodecs.getCodec(file);
    // open the file and seek to the start of the split
    FileSystem fs = file.getFileSystem(job);
    FSDataInputStream fileIn = fs.open(split.getPath());
    if (isCompressedInput()) {
        decompressor = CodecPool.getDecompressor(codec);
        if (codec instanceof SplittableCompressionCodec) {
            final SplitCompressionInputStream cIn = ((SplittableCompressionCodec) codec).createInputStream(
                    fileIn, decompressor, start, end, SplittableCompressionCodec.READ_MODE.BYBLOCK);
            // .tar.gz files are additionally wrapped in a TarInputStream
            String filename = file.getName();
            if (filename.endsWith(".tar.gz")) {
                in = new LineReader(new TarInputStream(cIn), job);
            } else {
                in = new LineReader(cIn, job);
            }
            start = cIn.getAdjustedStart();
            end = cIn.getAdjustedEnd();
            filePosition = cIn;
        } else {
            String filename = file.getName();
            if (filename.endsWith(".tar.gz")) {
                in = new LineReader(new TarInputStream(codec.createInputStream(fileIn, decompressor)), job);
            } else {
                in = new LineReader(codec.createInputStream(fileIn, decompressor), job);
            }
            filePosition = fileIn;
        }
    } else {
        fileIn.seek(start);
        in = new LineReader(fileIn, job);
        filePosition = fileIn;
    }
    // If this is not the first split, we always throw away the first record
    // because we always (except for the last split) read one extra line in
    // the next() method.
    if (start != 0) {
        start += in.readLine(new Text(), 0, maxBytesToConsume(start));
    }
    this.pos = start;
}
From source file:com.facebook.hive.orc.FileDump.java
License:Open Source License
private static void processFile(String filename, Configuration conf) throws IOException {
    final Path path = new Path(filename);
    ReaderWriterProfiler.setProfilerOptions(conf);
    System.out.println("Structure for " + filename);
    final Reader reader = OrcFile.createReader(path.getFileSystem(conf), path, conf);
    final RecordReaderImpl rows = (RecordReaderImpl) reader.rows(null);
    System.out.println("Rows: " + reader.getNumberOfRows());
    printCompressionInformation(reader);
    System.out.println("Raw data size: " + reader.getRawDataSize());
    System.out.println("Type: " + reader.getObjectInspector().getTypeName());
    printColumnStatistics(reader);
    printStripeInformation(reader, rows);
}
From source file:com.facebook.hive.orc.OrcInputFormat.java
License:Open Source License
@Override
public RecordReader<NullWritable, OrcLazyRow> getRecordReader(InputSplit inputSplit, JobConf conf,
        Reporter reporter) throws IOException {
    ReaderWriterProfiler.setProfilerOptions(conf);
    FileSplit fileSplit = (FileSplit) inputSplit;
    Path path = fileSplit.getPath();
    FileSystem fs = path.getFileSystem(conf);
    reporter.setStatus(fileSplit.toString());
    return new OrcRecordReader(OrcFile.createReader(fs, path, conf), conf, fileSplit.getStart(),
            fileSplit.getLength());
}
From source file:com.facebook.hive.orc.OrcOutputFormat.java
License:Open Source License
@Override
public RecordWriter<NullWritable, OrcSerdeRow> getRecordWriter(FileSystem fileSystem, JobConf conf,
        String name, Progressable reporter) throws IOException {
    ReaderWriterProfiler.setProfilerOptions(conf);
    // To be compatible with older file formats like Sequence and RC.
    // Only works if mapred.work.output.dir is set in the conf.
    Path workOutputPath = FileOutputFormat.getWorkOutputPath(conf);
    Path outputPath = workOutputPath == null ? new Path(name) : new Path(workOutputPath, name);
    if (fileSystem == null && workOutputPath != null) {
        fileSystem = workOutputPath.getFileSystem(conf);
    }
    return new OrcRecordWriter(fileSystem, outputPath, conf,
            OrcConf.ConfVars.HIVE_ORC_STRIPE_SIZE.defaultLongVal,
            OrcConf.ConfVars.HIVE_ORC_COMPRESSION.defaultVal,
            OrcConf.ConfVars.HIVE_ORC_COMPRESSION_BLOCK_SIZE.defaultIntVal,
            OrcConf.ConfVars.HIVE_ORC_ROW_INDEX_STRIDE.defaultIntVal);
}
From source file:com.facebook.hive.orc.OrcOutputFormat.java
License:Open Source License
@Override
public FileSinkOperator.RecordWriter getHiveRecordWriter(JobConf conf, Path path,
        Class<? extends Writable> valueClass, boolean isCompressed, Properties tableProperties,
        Progressable reporter) throws IOException {
    ReaderWriterProfiler.setProfilerOptions(conf);
    String stripeSizeStr = tableProperties.getProperty(OrcFile.STRIPE_SIZE);
    long stripeSize;
    if (stripeSizeStr != null) {
        stripeSize = Long.valueOf(stripeSizeStr);
    } else {
        stripeSize = OrcConf.getLongVar(conf, OrcConf.ConfVars.HIVE_ORC_STRIPE_SIZE);
    }
    String compression = tableProperties.getProperty(OrcFile.COMPRESSION);
    if (compression == null) {
        compression = OrcConf.getVar(conf, OrcConf.ConfVars.HIVE_ORC_COMPRESSION);
    }
    String compressionSizeStr = tableProperties.getProperty(OrcFile.COMPRESSION_BLOCK_SIZE);
    int compressionSize;
    if (compressionSizeStr != null) {
        compressionSize = Integer.valueOf(compressionSizeStr);
    } else {
        compressionSize = OrcConf.getIntVar(conf, OrcConf.ConfVars.HIVE_ORC_COMPRESSION_BLOCK_SIZE);
    }
    String rowIndexStrideStr = tableProperties.getProperty(OrcFile.ROW_INDEX_STRIDE);
    int rowIndexStride;
    if (rowIndexStrideStr != null) {
        rowIndexStride = Integer.valueOf(rowIndexStrideStr);
    } else {
        rowIndexStride = OrcConf.getIntVar(conf, OrcConf.ConfVars.HIVE_ORC_ROW_INDEX_STRIDE);
    }
    String enableIndexesStr = tableProperties.getProperty(OrcFile.ENABLE_INDEXES);
    boolean enableIndexes;
    if (enableIndexesStr != null) {
        enableIndexes = Boolean.valueOf(enableIndexesStr);
    } else {
        enableIndexes = OrcConf.getBoolVar(conf, OrcConf.ConfVars.HIVE_ORC_CREATE_INDEX);
    }
    if (!enableIndexes) {
        rowIndexStride = 0;
    }
    return new OrcRecordWriter(path.getFileSystem(conf), path, conf, stripeSize, compression,
            compressionSize, rowIndexStride);
}
From source file:com.facebook.hiveio.common.HadoopUtils.java
License:Apache License
/**
 * Delete the output directory for this job.
 *
 * @param conf Configuration to use
 * @throws IOException I/O errors
 */
public static void deleteOutputDir(Configuration conf) throws IOException {
    Path outputPath = getOutputPath(conf);
    outputPath.getFileSystem(conf).delete(outputPath, true);
}
From source file:com.facebook.hiveio.common.HadoopUtils.java
License:Apache License
/**
 * Set the worker output directory.
 *
 * @param context Task context
 * @throws IOException I/O errors
 */
public static void setWorkOutputDir(TaskAttemptContext context) throws IOException {
    Configuration conf = context.getConfiguration();
    String outputPath = getOutputDir(conf);
    // We need to do this to get the task path and set it for the mapred
    // implementation, since it can't be done automatically because of the
    // mapreduce -> mapred abstraction.
    if (outputPath != null) {
        FileOutputCommitter foc = new FileOutputCommitter(getOutputPath(conf), context);
        Path path = foc.getWorkPath();
        FileSystem fs = path.getFileSystem(conf);
        fs.mkdirs(path);
        conf.set("mapred.work.output.dir", path.toString());
        LOG.info("Setting mapred.work.output.dir to {}", path.toString());
    }
}