List of usage examples for org.apache.hadoop.fs FileSystem getFileBlockLocations
public BlockLocation[] getFileBlockLocations(Path p, long start, long len) throws IOException
From source file:PageInputFormat.java
License:Apache License
public InputSplit[] getSplits(JobConf job, int num) throws IOException { long minSize = 1; long maxSize = getMaxSplitSize(job); // generate splits List<InputSplit> splits = new ArrayList<InputSplit>(); FileStatus[] files = listStatus(job); for (FileStatus file : files) { Path path = file.getPath(); long length = file.getLen(); if (length != 0) { BlockLocation[] blkLocations; FileSystem fs = path.getFileSystem(job); blkLocations = fs.getFileBlockLocations(file, 0, length); if (isSplitable(path.getFileSystem(job), path)) { long blockSize = file.getBlockSize(); long splitSize = computeSplitSize(blockSize, minSize, maxSize); long bytesRemaining = length; while (((double) bytesRemaining) / splitSize > SPLIT_SLOP) { int blkIndex = getBlockIndex(blkLocations, length - bytesRemaining); splits.add(makeSplit(path, length - bytesRemaining, splitSize, blkLocations[blkIndex].getHosts())); bytesRemaining -= splitSize; }//from ww w.j a v a 2 s. c o m if (bytesRemaining != 0) { int blkIndex = getBlockIndex(blkLocations, length - bytesRemaining); splits.add(makeSplit(path, length - bytesRemaining, bytesRemaining, blkLocations[blkIndex].getHosts())); } } else splits.add(makeSplit(path, 0, length, blkLocations[0].getHosts())); } else splits.add(makeSplit(path, 0, length, new String[0])); } // Save the number of input files for metrics/loadgen job.setLong(NUM_INPUT_FILES, files.length); return splits.toArray(new InputSplit[0]); }
From source file:StreamWikiDumpInputFormat.java
License:Apache License
public List<InputSplit> getSplits(JobConf job, FileStatus file, String pattern, long splitSize) throws IOException { NetworkTopology clusterMap = new NetworkTopology(); List<InputSplit> splits = new ArrayList<InputSplit>(); Path path = file.getPath();//ww w . j a v a 2s. com long length = file.getLen(); FileSystem fs = file.getPath().getFileSystem(job); BlockLocation[] blkLocations = fs.getFileBlockLocations(file, 0, length); if ((length != 0) && isSplitable(fs, path)) { long bytesRemaining = length; SeekableInputStream in = SeekableInputStream.getInstance(path, 0, length, fs, this.compressionCodecs); InputStream is = null; long start = 0; long skip = 0; if (is != null) { // start = is.getAdjustedStart(); // length = is.getAdjustedEnd(); is.close(); in = null; } LOG.info("locations=" + Arrays.asList(blkLocations)); FileSplit split = null; Set<Long> processedPageEnds = new HashSet<Long>(); float factor = job.getFloat(KEY_SKIP_FACTOR, 1.2F); READLOOP: while (((double) bytesRemaining) / splitSize > factor && bytesRemaining > 0) { // prepare matcher ByteMatcher matcher; { long st = Math.min(start + skip + splitSize, length - 1); split = makeSplit(path, st, Math.min(splitSize, length - st), clusterMap, blkLocations); System.err.println("split move to: " + split); if (in != null) in.close(); if (split.getLength() <= 1) { break; } in = SeekableInputStream.getInstance(split, fs, this.compressionCodecs); // SplitCompressionInputStream cin = // in.getSplitCompressionInputStream(); } matcher = new ByteMatcher(in); // read until the next page end in the look-ahead split boolean reach = false; while (!matcher.readUntilMatch(pageEndPattern, null, split.getStart() + split.getLength())) { if (matcher.getPos() >= length || split.getLength() == length - split.getStart()) break READLOOP; reach = false; split = makeSplit(path, split.getStart(), Math.min(split.getLength() + splitSize, length - split.getStart()), clusterMap, blkLocations); System.err.println("split extend to: " + 
split); } System.err.println( path + ": #" + splits.size() + " " + pageEndPattern + " found: pos=" + matcher.getPos() + " last=" + matcher.getLastUnmatchPos() + " read=" + matcher.getReadBytes() + " current=" + start + " remaining=" + bytesRemaining + " split=" + split); if (matcher.getLastUnmatchPos() > 0 && matcher.getPos() > matcher.getLastUnmatchPos() && !processedPageEnds.contains(matcher.getPos())) { splits.add(makeSplit(path, start, matcher.getPos() - start, clusterMap, blkLocations)); processedPageEnds.add(matcher.getPos()); long newstart = Math.max(matcher.getLastUnmatchPos(), start); bytesRemaining = length - newstart; start = newstart; skip = 0; } else { skip = matcher.getPos() - start; } } if (bytesRemaining > 0 && !processedPageEnds.contains(length)) { System.err.println( pageEndPattern + " remaining: pos=" + (length - bytesRemaining) + " end=" + length); splits.add(makeSplit(path, length - bytesRemaining, bytesRemaining, blkLocations[blkLocations.length - 1].getHosts())); } if (in != null) in.close(); } else if (length != 0) { splits.add(makeSplit(path, 0, length, clusterMap, blkLocations)); } else { // Create empty hosts array for zero length files splits.add(makeSplit(path, 0, length, new String[0])); } return splits; }
From source file:DupleInputFormat.java
License:Apache License
/** * Generate the list of files and make them into FileSplits. * @param job the job context// ww w . j av a 2 s .c o m * @throws IOException */ public List<InputSplit> getSplits(JobContext job) throws IOException { long minSize = Math.max(getFormatMinSplitSize(), getMinSplitSize(job)); long maxSize = getMaxSplitSize(job); // generate splits List<InputSplit> splits = new ArrayList<InputSplit>(); List<FileStatus> files = listStatus(job); // times that each file exists in the files List ArrayList<Integer> times = new ArrayList<Integer>(); ArrayList<Path> paths = new ArrayList<Path>(); for (FileStatus file : files) { Path path = file.getPath(); long length = file.getLen(); if (length != 0) { FileSystem fs = path.getFileSystem(job.getConfiguration()); BlockLocation[] blkLocations = fs.getFileBlockLocations(file, 0, length); int index; if ((index = paths.indexOf(path)) != -1) times.set(index, times.get(index) + 1); else { times.add(0); paths.add(path); index = times.size() - 1; } // not splitable splits.add(makeSplit(path, 0, length, blkLocations[0].getHosts(), times.get(index))); } else { //Create empty hosts array for zero length files splits.add(makeSplit(path, 0, length, new String[0])); } } // Save the number of input files for metrics/loadgen job.getConfiguration().setLong(NUM_INPUT_FILES, files.size()); //LOG.debug("Total # of splits: " + splits.size()); return splits; }
From source file:HDFSFileFinder.java
License:Apache License
private static void getBlockLocationsFromHdfs() { StringBuilder sb = new StringBuilder(); Configuration conf = new Configuration(); boolean first = true; // make connection to hdfs try {//from www . j a v a 2 s . c o m if (verbose) { writer.println("DEBUG: Trying to connect to " + fsName); } FileSystem fs = FileSystem.get(conf); Path file = new Path(fileName); FileStatus fStatus = fs.getFileStatus(file); status = fStatus; bLocations = fs.getFileBlockLocations(status, 0, status.getLen()); //print out all block locations for (BlockLocation aLocation : bLocations) { String[] names = aLocation.getHosts(); for (String name : names) { InetAddress addr = InetAddress.getByName(name); String host = addr.getHostName(); int idx = host.indexOf('.'); String hostname; if (0 < idx) { hostname = host.substring(0, host.indexOf('.')); } else { hostname = host; } if (first) { sb.append(hostname); first = false; } else { sb.append(",").append(hostname); } } } sb.append(NEWLINE); } catch (IOException e) { writer.println("Error getting block location data from namenode"); e.printStackTrace(); } writer.print(sb.toString()); writer.flush(); }
From source file:a.TestConcatExample.java
License:Apache License
@Test public void concatIsPermissive() throws IOException, URISyntaxException { MiniDFSCluster cluster = null;/*from w w w .j a va 2 s.com*/ final Configuration conf = WebHdfsTestUtil.createConf(); conf.set("dfs.namenode.fs-limits.min-block-size", "1000"); // Allow tiny blocks for the test try { cluster = new MiniDFSCluster.Builder(conf).numDataNodes(2).build(); cluster.waitActive(); final FileSystem webHdfs = WebHdfsTestUtil.getWebHdfsFileSystem(conf, WebHdfsFileSystem.SCHEME); final FileSystem dfs = cluster.getFileSystem(); final FileSystem fs = dfs; // WebHDFS has a bug in getLocatedBlocks Path root = new Path("/dir"); fs.mkdirs(root); short origRep = 3; short secondRep = (short) (origRep - 1); Path f1 = new Path("/dir/f1"); long size1 = writeFile(fs, f1, /* blocksize */ 4096, origRep, 5); long f1NumBlocks = fs.getFileBlockLocations(f1, 0, size1).length; assertEquals(5, f1NumBlocks); Path f2 = new Path("/dir/f2"); long size2 = writeFile(fs, f2, /* blocksize (must divide 512 for checksum) */ 4096 - 512, secondRep, 4); long f2NumBlocks = fs.getFileBlockLocations(f2, 0, size2).length; assertEquals(5, f2NumBlocks); fs.concat(f1, new Path[] { f2 }); FileStatus[] fileStatuses = fs.listStatus(root); // Only one file should remain assertEquals(1, fileStatuses.length); FileStatus fileStatus = fileStatuses[0]; // And it should be named after the first file assertEquals("f1", fileStatus.getPath().getName()); // The entire file takes the replication of the first argument assertEquals(origRep, fileStatus.getReplication()); // As expected, the new concated file is the length of both the previous files assertEquals(size1 + size2, fileStatus.getLen()); // And we should have the same number of blocks assertEquals(f1NumBlocks + f2NumBlocks, fs.getFileBlockLocations(fileStatus.getPath(), 0, size1 + size2).length); } finally { if (cluster != null) { cluster.shutdown(); } } }
From source file:ca.sparkera.adapters.mapred.MainframeVBInputFormat.java
License:Apache License
/** * Splits files returned by {@link #listStatus(JobConf)} when they're too * big.// www .j av a 2 s . c o m */ @Override @SuppressWarnings("deprecation") public InputSplit[] getSplits(JobConf job, int numSplits) throws IOException { FileStatus[] files = listStatus(job); for (FileStatus file : files) { // check we have valid files if (file.isDir()) { throw new IOException("Not a file: " + file.getPath()); } totalSize += file.getLen(); } long goalSize = totalSize / (numSplits == 0 ? 1 : numSplits); long minSize = Math.max(job.getLong("mapred.min.split.size", 1), minSplitSize); // generate splits ArrayList<FileSplit> splits = new ArrayList<FileSplit>(numSplits); for (FileStatus file : files) { Path path = file.getPath(); FileSystem fs = path.getFileSystem(job); FSDataInputStream fileIn; InputStream inputStream; fileIn = fs.open(path); inputStream = fileIn; filePosition = fileIn; long offset = 0; long length = file.getLen(); BlockLocation[] blkLocations = fs.getFileBlockLocations(file, 0, length); if ((length != 0) && isSplitable(fs, path)) { long blockSize = file.getBlockSize(); long bytesRemaining = length; long splitSize = 0; while (offset < length) { splitSize = computeSplitSize(goalSize, minSize, blockSize, inputStream); int blkIndex = getBlockIndex(blkLocations, length - bytesRemaining); splits.add(new FileSplit(path, length - bytesRemaining, splitSize, blkLocations[blkIndex].getHosts())); bytesRemaining -= splitSize; offset = length - bytesRemaining; } if (bytesRemaining != 0) { throw new IOException( "Partial record(length = " + bytesRemaining + ") found at the end of file " + path); } } else if (length != 0) { splits.add(new FileSplit(path, 0, length, blkLocations[0].getHosts())); } else { // Create empty hosts array for zero length files splits.add(new FileSplit(path, 0, length, new String[0])); } if (inputStream != null) { inputStream.close(); inputStream = null; } } java.util.Date date = new java.util.Date(); System.out.println((new 
Timestamp(date.getTime())) + ",\t Split = 100% Total Splits - " + (++splitCount) + "\t Total Records in VB file - " + totalRecords); LOG.debug("Total # of splits: " + splits.size()); return splits.toArray(new FileSplit[splits.size()]); }
From source file:co.cask.cdap.data.stream.StreamDataFileSplitter.java
License:Apache License
/** * Computes splits for the event file.//from w ww. j a v a2 s . c om */ <T> void computeSplits(FileSystem fs, long minSplitSize, long maxSplitSize, long startTime, long endTime, List<T> splits, StreamInputSplitFactory<T> splitFactory) throws IOException { // Compute the splits based on the min/max size Path eventFile = eventFileStatus.getPath(); Path indexFile = getIndexFile(eventFile); BlockLocation[] blockLocations = fs.getFileBlockLocations(eventFile, 0, eventFileStatus.getLen()); long length = eventFileStatus.getLen(); long offset = 0; int blockIndex = 0; while (offset < length) { blockIndex = getBlockIndex(blockLocations, offset, blockIndex); String[] hosts = null; if (blockIndex >= 0) { hosts = blockLocations[blockIndex].getHosts(); } else { blockIndex = 0; } long splitSize = computeSplitSize(eventFileStatus, offset, minSplitSize, maxSplitSize); splits.add( splitFactory.createSplit(eventFile, indexFile, startTime, endTime, offset, splitSize, hosts)); offset += splitSize; } // One extra split for the tail of the file. splits.add( splitFactory.createSplit(eventFile, indexFile, startTime, endTime, offset, Long.MAX_VALUE, null)); }
From source file:com.asakusafw.runtime.directio.hadoop.BlockMap.java
License:Apache License
/**
 * Returns a list of {@link BlockInfo} for the target file, one entry per block
 * location reported for the byte range {@code [0, status.getLen())}.
 * @param fs the file system that holds the target file
 * @param status the target file status
 * @return the computed information, in the order the locations were reported
 * @throws IOException if failed to compute information
 */
public static List<BlockInfo> computeBlocks(FileSystem fs, FileStatus status) throws IOException {
    final long fileLength = status.getLen();
    final BlockLocation[] locations = fs.getFileBlockLocations(status, 0, fileLength);

    final List<BlockInfo> blocks = new ArrayList<>();
    for (int i = 0; i < locations.length; i++) {
        final BlockLocation block = locations[i];
        final long begin = block.getOffset();
        final long end = begin + block.getLength();
        blocks.add(new BlockInfo(begin, end, block.getHosts()));
    }
    return blocks;
}
From source file:com.bianfeng.bfas.hive.io.RealtimeInputFormat2.java
License:Apache License
/** Splits files returned by {@link #listStatus(JobConf)} when * they're too big.*///from w ww. j a v a 2s. c o m @SuppressWarnings("deprecation") public InputSplit[] getSplits(JobConf job, int numSplits) throws IOException { LOG.warn("test go go go"); FileStatus[] files = listStatus(job); // Save the number of input files in the job-conf job.setLong(NUM_INPUT_FILES, files.length); long totalSize = 0; // compute total size for (FileStatus file : files) { // check we have valid files if (file.isDir()) { throw new IOException("Not a file: " + file.getPath()); } totalSize += file.getLen(); } long goalSize = totalSize / (numSplits == 0 ? 1 : numSplits); long minSize = Math.max(job.getLong("mapred.min.split.size", 1), minSplitSize); // generate splits ArrayList<FileSplit> splits = new ArrayList<FileSplit>(numSplits); NetworkTopology clusterMap = new NetworkTopology(); for (FileStatus file : files) { Path path = file.getPath(); FileSystem fs = path.getFileSystem(job); long length = file.getLen(); BlockLocation[] blkLocations = fs.getFileBlockLocations(file, 0, length); if ((length != 0) && isSplitable(fs, path)) { long blockSize = file.getBlockSize(); long splitSize = computeSplitSize(goalSize, minSize, blockSize); long bytesRemaining = length; while (((double) bytesRemaining) / splitSize > SPLIT_SLOP) { String[] splitHosts = getSplitHosts(blkLocations, length - bytesRemaining, splitSize, clusterMap); splits.add(new FileSplit(path, length - bytesRemaining, splitSize, splitHosts)); bytesRemaining -= splitSize; } if (bytesRemaining != 0) { splits.add(new FileSplit(path, length - bytesRemaining, bytesRemaining, blkLocations[blkLocations.length - 1].getHosts())); } } else if (length != 0) { String[] splitHosts = getSplitHosts(blkLocations, 0, length, clusterMap); splits.add(new FileSplit(path, 0, length, splitHosts)); } else { //Create empty hosts array for zero length files splits.add(new FileSplit(path, 0, length, new String[0])); } } LOG.debug("Total # of splits: " + 
splits.size()); return splits.toArray(new FileSplit[splits.size()]); }
From source file:com.chinamobile.bcbsp.io.BSPFileInputFormat.java
License:Apache License
/** * Generate the list of files and make them into FileSplits. * * @param job/* w ww . j av a 2 s .c om*/ * The current BSPJob job * @return input splits */ @Override public List<InputSplit> getSplits(BSPJob job) throws IOException { List<InputSplit> splits = new ArrayList<InputSplit>(); for (FileStatus file : listStatus(job)) { Path path = file.getPath(); FileSystem fs = path.getFileSystem(job.getConf()); long length = file.getLen(); BlockLocation[] blkLocations = fs.getFileBlockLocations(file, 0, length); if ((length != 0) && isSplitable(job, path)) { long blockSize = file.getBlockSize(); long splitSize = 0L; if (job.getInt(Constants.USER_BC_BSP_JOB_SPLIT_FACTOR, 1) == 1) { if (job.getSplitSize() == 0L) { splitSize = blockSize; } else { splitSize = job.getSplitSize(); } } else { if (job.getSplitSize() == 0L) { splitSize = blockSize * job.getInt(Constants.USER_BC_BSP_JOB_SPLIT_FACTOR, 1); } else { splitSize = job.getSplitSize() * job.getInt(Constants.USER_BC_BSP_JOB_SPLIT_FACTOR, 1); } } LOG.info("[Split Size] " + (splitSize / (1024 * 1024)) + " MB"); long bytesRemaining = length; while (((double) bytesRemaining) / splitSize > SPLIT_SLOP) { int blkIndex = getBlockIndex(blkLocations, length - bytesRemaining); splits.add(new FileSplit(path, length - bytesRemaining, splitSize, blkLocations[blkIndex].getHosts())); bytesRemaining -= splitSize; } if (bytesRemaining != 0) { splits.add(new FileSplit(path, length - bytesRemaining, bytesRemaining, blkLocations[blkLocations.length - 1].getHosts())); } } else if (length != 0) { splits.add(new FileSplit(path, 0, length, blkLocations[0].getHosts())); } else { // Create empty hosts array for zero length files splits.add(new FileSplit(path, 0, length, new String[0])); } } LOG.info("[Split Number] " + splits.size()); return splits; }