List of usage examples for org.apache.hadoop.fs Path getFileSystem
public FileSystem getFileSystem(Configuration conf) throws IOException
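Before the per-project examples below, a minimal self-contained sketch of the call itself may help. The class name, path, and configuration values here are hypothetical and only illustrate how the resolved FileSystem is typically used; the examples that follow show the same call embedded in real input formats, record readers, partitioners, and writers.

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class GetFileSystemSketch {
    public static void main(String[] args) throws IOException {
        // Picks up core-site.xml / hdfs-site.xml from the classpath, if present.
        Configuration conf = new Configuration();
        // Hypothetical path; its URI scheme (hdfs://, file://, s3a://, ...) decides which FileSystem is returned.
        Path path = new Path("/tmp/example.txt");
        // getFileSystem(conf) resolves the FileSystem that owns this path's scheme and authority.
        FileSystem fs = path.getFileSystem(conf);
        // Use the resolved FileSystem to inspect the path; throws FileNotFoundException if it is missing.
        FileStatus status = fs.getFileStatus(path);
        System.out.println(status.getLen() + " bytes at " + status.getPath());
    }
}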
From source file:com.dinglicom.clouder.mapreduce.input.FileInputFormat.java
License:Apache License
/**
 * Add a {@link Path} to the list of inputs for the map-reduce job.
 *
 * @param job  The {@link Job} to modify
 * @param path {@link Path} to be added to the list of inputs for
 *             the map-reduce job.
 */
public static void addInputPath(Job job, Path path) throws IOException {
    Configuration conf = job.getConfiguration();
    path = path.getFileSystem(conf).makeQualified(path);
    String dirStr = StringUtils.escapeString(path.toString());
    String dirs = conf.get(INPUT_DIR);
    conf.set(INPUT_DIR, dirs == null ? dirStr : dirs + "," + dirStr);
}
From source file:com.dinglicom.clouder.mapreduce.input.LineRecordReader.java
License:Apache License
public void initialize(InputSplit genericSplit, TaskAttemptContext context) throws IOException {
    FileSplit split = (FileSplit) genericSplit;
    System.out.println("-------------------length:" + split.getLength() + "\tposition:" + split.getStart());
    Configuration job = context.getConfiguration();
    this.maxLineLength = job.getInt(MAX_LINE_LENGTH, Integer.MAX_VALUE);
    start = split.getStart();
    end = start + split.getLength();
    final Path file = split.getPath();
    key = new Text(FileToCDRType.getTypeByPath(file.getName()));
    compressionCodecs = new CompressionCodecFactory(job);
    codec = compressionCodecs.getCodec(file);

    // open the file and seek to the start of the split
    final FileSystem fs = file.getFileSystem(job);
    fileIn = fs.open(file);
    if (isCompressedInput()) {
        decompressor = CodecPool.getDecompressor(codec);
        if (codec instanceof SplittableCompressionCodec) {
            final SplitCompressionInputStream cIn = ((SplittableCompressionCodec) codec).createInputStream(
                    fileIn, decompressor, start, end, SplittableCompressionCodec.READ_MODE.BYBLOCK);
            if (null == this.recordDelimiterBytes) {
                in = new LineReader(cIn, job);
            } else {
                in = new LineReader(cIn, job, this.recordDelimiterBytes);
            }
            start = cIn.getAdjustedStart();
            end = cIn.getAdjustedEnd();
            filePosition = cIn;
        } else {
            if (null == this.recordDelimiterBytes) {
                in = new LineReader(codec.createInputStream(fileIn, decompressor), job);
            } else {
                in = new LineReader(codec.createInputStream(fileIn, decompressor), job,
                        this.recordDelimiterBytes);
            }
            filePosition = fileIn;
        }
    } else {
        fileIn.seek(start);
        if (null == this.recordDelimiterBytes) {
            in = new LineReader(fileIn, job);
        } else {
            in = new LineReader(fileIn, job, this.recordDelimiterBytes);
        }
        filePosition = fileIn;
    }
    // If this is not the first split, we always throw away first record
    // because we always (except the last split) read one extra line in
    // next() method.
    if (start != 0) {
        start += in.readLine(new Text(), 0, maxBytesToConsume(start));
    }
    this.pos = start;
}
From source file:com.ds.lzo.DeprecatedLzoLineRecordReaderForCombined.java
License:Open Source License
public DeprecatedLzoLineRecordReaderForCombined(Configuration conf, FileSplit split) throws IOException {
    LOG.warn("split start: " + split.getStart());
    LOG.warn("split length: " + split.getLength());
    String[] locs = split.getLocations();
    for (String loc : locs) {
        LOG.warn("location: " + loc);
    }
    start = split.getStart();
    end = start + split.getLength();
    LOG.warn("split end: " + end);
    final Path file = split.getPath();
    LOG.warn("file: " + file.getName());
    LOG.warn("INT split start: " + (int) split.getStart());
    LOG.warn("INT split length: " + (int) split.getLength());
    LOG.warn("INT split end: " + (int) end);
    FileSystem fs = file.getFileSystem(conf);
    codecFactory = new CompressionCodecFactory(conf);
    final CompressionCodec codec = codecFactory.getCodec(file);
    // Check for a missing codec before logging it, so a null codec fails with a clear message
    // instead of a NullPointerException.
    if (codec == null) {
        throw new IOException("No LZO codec found, cannot run.");
    }
    LOG.warn("codec: " + codec.toString());
    LOG.warn("config: " + conf.toString());
    // Open the file and seek to the next split.
    fileIn = fs.open(file);
    // Create input stream and read the file header.
    in = new LineReader(codec.createInputStream(fileIn), conf);
    if (start != 0) {
        fileIn.seek(start);
        LOG.warn("fileIn position: " + fileIn.getPos());
        LOG.warn("buffer size: " + conf.get("io.file.buffer.size"));
        // Read and ignore the first line.
        in.readLine(new Text());
        start = fileIn.getPos();
    }
    pos = start;
}
From source file:com.ebay.erl.mobius.core.criterion.TupleRestrictions.java
License:Apache License
/**
 * Create a tuple criterion that only accepts tuples whose value of the
 * <code>column</code> is present in the given <code>file</code>.
 * <p>
 *
 * The file is assumed to be a single-column text file with one or more lines.
 * Each line is read into a case-insensitive set, and that set is used to check
 * whether the value of the <code>column</code> is contained in it.
 *
 * @param column the name of the column whose value is tested for membership
 *               in the given <code>file</code>
 *
 * @param file a single-column, multi-line file containing strings/numbers;
 *             each line is treated as a single unit.
 *
 * @return an instance of {@link TupleCriterion} that extracts only the records
 *         whose <code>column</code> value is present in the given
 *         <code>file</code>.
 *
 * @throws FileNotFoundException if the given file cannot be found.
 */
public static TupleCriterion within(final String column, File file) throws FileNotFoundException {
    final File f = TupleRestrictions.checkFileExist(file);

    return new TupleCriterion() {

        private static final long serialVersionUID = -1121221619118915652L;

        private Set<String> set;

        @Override
        public void setConf(Configuration conf) {
            try {
                if (conf.get("tmpfiles") == null || conf.get("tmpfiles").trim().length() == 0) {
                    conf.set("tmpfiles", validateFiles(f.getAbsolutePath(), conf));
                } else {
                    conf.set("tmpfiles", validateFiles(f.getAbsolutePath(), conf) + "," + conf.get("tmpfiles"));
                }
            } catch (IOException e) {
                throw new IllegalArgumentException(e);
            }
        }

        /**
         * COPIED FROM org.apache.hadoop.util.GenericOptionsParser
         */
        private String validateFiles(String files, Configuration conf) throws IOException {
            if (files == null)
                return null;

            String[] fileArr = files.split(",");
            String[] finalArr = new String[fileArr.length];
            for (int i = 0; i < fileArr.length; i++) {
                String tmp = fileArr[i];
                String finalPath;
                Path path = new Path(tmp);
                URI pathURI = path.toUri();
                FileSystem localFs = FileSystem.getLocal(conf);
                if (pathURI.getScheme() == null) {
                    // default to the local file system
                    // check if the file exists or not first
                    if (!localFs.exists(path)) {
                        throw new FileNotFoundException("File " + tmp + " does not exist.");
                    }
                    finalPath = path.makeQualified(localFs).toString();
                } else {
                    // check if the file exists in this file system
                    // we need to recreate this filesystem object to copy
                    // these files to the file system jobtracker is running
                    // on.
                    FileSystem fs = path.getFileSystem(conf);
                    if (!fs.exists(path)) {
                        throw new FileNotFoundException("File " + tmp + " does not exist.");
                    }
                    finalPath = path.makeQualified(fs).toString();
                    try {
                        fs.close();
                    } catch (IOException e) {
                    }
                }
                finalArr[i] = finalPath;
            }
            return StringUtils.arrayToString(finalArr);
        }

        @Override
        protected boolean evaluate(Tuple tuple, Configuration configuration) {
            if (set == null) {
                set = new CaseInsensitiveTreeSet();
                BufferedReader br = null;
                try {
                    br = new BufferedReader(new FileReader(new File(f.getName())));
                    String newLine = null;
                    while ((newLine = br.readLine()) != null) {
                        this.set.add(newLine);
                    }
                } catch (IOException e) {
                    throw new RuntimeException(e);
                } finally {
                    try {
                        br.close();
                    } catch (Throwable e) {
                    }
                }
            }

            String value = tuple.getString(column);
            if (value != null) {
                return this.set.contains(value);
            } else {
                return false;
            }
        }

        @Override
        public String[] getInvolvedColumns() {
            return new String[] { column };
        }
    };
}
From source file:com.ebay.erl.mobius.core.datajoin.EvenlyPartitioner.java
License:Apache License
/**
 * Read in the partition file and build indexing data structures.
 * If the keytype is {@link org.apache.hadoop.io.BinaryComparable} and
 * <tt>total.order.partitioner.natural.order</tt> is not false, a trie
 * of the first <tt>total.order.partitioner.max.trie.depth</tt>(2) + 1 bytes
 * will be built. Otherwise, keys will be located using a binary search of
 * the partition keyset using the {@link org.apache.hadoop.io.RawComparator}
 * defined for this job. The input file must be sorted with the same
 * comparator and contain
 * {@link org.apache.hadoop.mapred.JobConf#getNumReduceTasks} - 1 keys.
 */
@SuppressWarnings("unchecked") // keytype from conf not static
public void configure(JobConf job) {
    try {
        String parts = getPartitionFile(job);
        final Path partFile = new Path(parts);
        final FileSystem fs = (DEFAULT_PATH.equals(parts)) ? FileSystem.getLocal(job) // assume in DistributedCache
                : partFile.getFileSystem(job);

        //Class<K> keyClass = (Class<K>)job.getMapOutputKeyClass();
        K[] splitPoints = readPartitions(fs, partFile, (Class<K>) Tuple.class, job);
        if (splitPoints.length != job.getNumReduceTasks() - 1) {
            throw new IOException("Wrong number of partitions in keyset");
        }

        RawComparator<K> comparator = (RawComparator<K>) job.getOutputKeyComparator();
        for (int i = 0; i < splitPoints.length - 1; ++i) {
            if (comparator.compare(splitPoints[i], splitPoints[i + 1]) >= 0) {
                throw new IOException("Split points are out of order");
            }
        }
        boolean natOrder = job.getBoolean("total.order.partitioner.natural.order", true);
        if (natOrder && BinaryComparable.class.isAssignableFrom(Tuple.class)) {
            partitions = buildTrie((BinaryComparable[]) splitPoints, 0, splitPoints.length, new byte[0],
                    job.getInt("total.order.partitioner.max.trie.depth", 2));
        } else {
            partitions = new BinarySearchNode(splitPoints, comparator);
        }
    } catch (IOException e) {
        throw new IllegalArgumentException("Can't read partitions file", e);
    }
}
From source file:com.ebay.nest.io.nestfile.FileDump.java
License:Apache License
public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    for (String filename : args) {
        System.out.println("Structure for " + filename);
        Path path = new Path(filename);
        Reader reader = OrcFile.createReader(path.getFileSystem(conf), path);
        RecordReaderImpl rows = (RecordReaderImpl) reader.rows(null);
        System.out.println("Rows: " + reader.getNumberOfRows());
        System.out.println("Compression: " + reader.getCompression());
        if (reader.getCompression() != CompressionKind.NONE) {
            System.out.println("Compression size: " + reader.getCompressionSize());
        }
        System.out.println("Type: " + reader.getObjectInspector().getTypeName());
        ColumnStatistics[] stats = reader.getStatistics();
        System.out.println("\nStatistics:");
        for (int i = 0; i < stats.length; ++i) {
            System.out.println(" Column " + i + ": " + stats[i].toString());
        }
        System.out.println("\nStripes:");
        for (StripeInformation stripe : reader.getStripes()) {
            long stripeStart = stripe.getOffset();
            System.out.println(" Stripe: " + stripe.toString());
            OrcProto.StripeFooter footer = rows.readStripeFooter(stripe);
            long sectionStart = stripeStart;
            for (OrcProto.Stream section : footer.getStreamsList()) {
                System.out.println(" Stream: column " + section.getColumn() + " section " + section.getKind()
                        + " start: " + sectionStart + " length " + section.getLength());
                sectionStart += section.getLength();
            }
            for (int i = 0; i < footer.getColumnsCount(); ++i) {
                OrcProto.ColumnEncoding encoding = footer.getColumns(i);
                StringBuilder buf = new StringBuilder();
                buf.append(" Encoding column ");
                buf.append(i);
                buf.append(": ");
                buf.append(encoding.getKind());
                if (encoding.getKind() == OrcProto.ColumnEncoding.Kind.DICTIONARY) {
                    buf.append("[");
                    buf.append(encoding.getDictionarySize());
                    buf.append("]");
                }
                System.out.println(buf);
            }
        }
    }
}
From source file:com.ebay.nest.io.nestfile.OrcFile.java
License:Apache License
public static Writer createWriter(Path path, WriterOptions opts) throws IOException {
    FileSystem fs = opts.fileSystemValue == null ? path.getFileSystem(opts.configuration)
            : opts.fileSystemValue;
    return new WriterImpl(fs, path, opts.configuration, opts.inspectorValue, opts.stripeSizeValue,
            opts.compressValue, opts.bufferSizeValue, opts.rowIndexStrideValue, opts.memoryManagerValue,
            opts.blockPaddingValue, opts.versionValue);
}
From source file:com.edwardsit.spark4n6.EWFFileReaderTest.java
License:Apache License
@Test
public void testGetEWFSection() throws IOException {
    log.setLevel(Level.DEBUG);
    Logger.getLogger("com.edwardsit.spark4n6")
            .addAppender(new RollingFileAppender(new PatternLayout(), "debug.log"));
    Configuration conf = new Configuration(false);
    Path path = new Path("../macwd.E01");
    // Path path = new Path("D:\\Users\\Derek\\Images\\500GB\\500GB-CDrive.E01");
    FileSystem fs = path.getFileSystem(conf);
    EWFFileReader reader = new EWFFileReader(fs, path);
    long size = reader.getImageSize();
    ArrayList<EWFSection.SectionPrefix> sections = reader.getSectionPrefixArray();
    Iterator<EWFSection.SectionPrefix> it = sections.iterator();
    EWFSection.SectionPrefix sp;
    long numSplits = 10L;
    long priorStart = 0L;
    long priorEnd = 0L;
    Path priorFile = null;
    log.debug(path.getName() + ": imageSize = " + size);
    log.debug("File\t\tChunkIndex\t\tSectionType\t\tChunkCount\t\tSectionSize");
    while (it.hasNext()) {
        sp = it.next();
        assertNotNull(sp);
        log.debug(sp.file + "\t\t" + sp.chunkIndex + "\t\t" + sp.sectionType + "\t\t" + sp.chunkCount + "\t\t"
                + sp.sectionSize);
        if (!sp.file.equals(priorFile) && sp.sectionType.equals(EWFSection.SectionType.TABLE_TYPE)) {
            if (priorFile != null) {
                priorEnd = sp.chunkIndex;
                // log.debug(priorFile + "Split#" + (numSplits * priorEnd * 64 * 512 / size) + ", " + priorStart + " to " + priorEnd);
            }
            priorFile = sp.file;
            priorStart = sp.chunkIndex;
        }
    }
    // log.debug(priorFile + " Split#" + (numSplits * priorEnd * 64 * 512 / size) + ", " + priorStart + " to " + size / 64 / 512);
}
From source file:com.edwardsit.spark4n6.EWFImageInputFormat.java
License:Apache License
@Override
public List<InputSplit> getSplits(JobContext job) throws IOException {
    log.setLevel(Level.DEBUG);
    List<InputSplit> splits = new ArrayList<InputSplit>();
    List<FileStatus> files = listStatus(job);
    BlockLocation[] blkLocations = null;
    Path path = null;
    FileSystem fs = null;
    EWFFileReader ewf = null;
    ArrayList<EWFSection.SectionPrefix> sections = null;
    Iterator<EWFSection.SectionPrefix> it = null;
    EWFSection.SectionPrefix sp = null;
    Path priorFile = null;
    long priorOffset = 0L;
    FileStatus priorFileStatus = null;
    chunkSize = new EWFSegmentFileReader(fs).DEFAULT_CHUNK_SIZE;
    long priorStart = 0L;
    int blkIndex = 0;
    for (FileStatus file : files) {
        path = file.getPath();
        fs = path.getFileSystem(job.getConfiguration());
        if (path.getName().endsWith(".E01")) {
            ewf = new EWFFileReader(fs, path);
            sections = ewf.getSectionPrefixArray();
            it = sections.iterator();
            while (it.hasNext()) {
                sp = it.next();
                if (sp.sectionType.equals(EWFSection.SectionType.TABLE_TYPE)) {
                    priorFileStatus = fs.getFileStatus(priorFile);
                    for (long i = sp.chunkCount; i > 0L; i = i - getChunksPerSplit(priorFileStatus)) {
                        if (priorFileStatus instanceof LocatedFileStatus) {
                            blkLocations = ((LocatedFileStatus) priorFileStatus).getBlockLocations();
                        } else {
                            blkLocations = fs.getFileBlockLocations(priorFileStatus, priorOffset,
                                    (getChunksPerSplit(priorFileStatus) * chunkSize));
                        }
                        blkIndex = getBlockIndex(blkLocations, priorOffset);
                        if (i > getChunksPerSplit(priorFileStatus)) {
                            log.debug("splits.add(makeSplit(" + priorFile + ", " + (priorStart * chunkSize) + ", "
                                    + (getChunksPerSplit(priorFileStatus) * chunkSize) + ", "
                                    + listHosts(blkLocations, blkIndex) + ");");
                            splits.add(makeSplit(priorFile, (priorStart * chunkSize),
                                    (getChunksPerSplit(priorFileStatus) * chunkSize),
                                    blkLocations[blkIndex].getHosts()));
                            priorStart += getChunksPerSplit(priorFileStatus);
                        } else {
                            log.debug("splits.add(makeSplit(" + priorFile + ", " + (priorStart * chunkSize) + ", "
                                    + (i * chunkSize) + ", " + listHosts(blkLocations, blkIndex) + ");");
                            splits.add(makeSplit(priorFile, (priorStart * chunkSize), (i * chunkSize),
                                    blkLocations[blkIndex].getHosts()));
                            priorStart += i;
                        }
                    }
                }
                priorFile = sp.file;
                priorOffset = sp.fileOffset;
            }
        }
    }
    return splits;
}
From source file:com.elex.dmp.core.TopicModel.java
License:Apache License
public void persist(Path outputDir, boolean overwrite) throws IOException {
    FileSystem fs = outputDir.getFileSystem(conf);
    if (overwrite) {
        fs.delete(outputDir, true); // CHECK second arg
    }
    DistributedRowMatrixWriter.write(outputDir, conf, topicTermCounts);
}