List of usage examples for org.apache.hadoop.fs.FileSystem#getFileBlockLocations
public BlockLocation[] getFileBlockLocations(Path p, long start, long len) throws IOException
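The method returns the BlockLocation entries for the blocks that overlap the byte range [start, start + len) of the given file. A FileStatus-based overload, getFileBlockLocations(FileStatus file, long start, long len), has the same semantics and is the one most of the examples below call. Before the full examples, here is a minimal sketch of the basic call pattern; the input path /tmp/data.txt is a hypothetical placeholder:

import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.BlockLocation;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class BlockLocationExample {
    public static void main(String[] args) throws IOException {
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(conf);
        Path path = new Path("/tmp/data.txt"); // hypothetical input file
        long len = fs.getFileStatus(path).getLen();
        // Locations of every block overlapping the byte range [0, len)
        BlockLocation[] locations = fs.getFileBlockLocations(path, 0, len);
        for (BlockLocation location : locations) {
            System.out.printf("offset=%,d length=%,d hosts=%s%n",
                    location.getOffset(), location.getLength(),
                    String.join(",", location.getHosts()));
        }
        fs.close();
    }
}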
From source file:ml.shifu.guagua.mapreduce.GuaguaInputFormat.java
License:Apache License
/**
 * Generate the list of files and make them into FileSplits.
 */
protected List<InputSplit> getGuaguaSplits(JobContext job) throws IOException {
    long minSize = Math.max(getFormatMinSplitSize(), getMinSplitSize(job));
    long maxSize = getMaxSplitSize(job);
    // generate splits
    List<InputSplit> splits = new ArrayList<InputSplit>();
    List<FileStatus> files = listStatus(job);
    for (FileStatus file : files) {
        Path path = file.getPath();
        if (isPigOrHadoopMetaFile(path)) {
            continue;
        }
        FileSystem fs = path.getFileSystem(job.getConfiguration());
        long length = file.getLen();
        BlockLocation[] blkLocations = fs.getFileBlockLocations(file, 0, length);
        if ((length != 0) && isSplitable(job, path)) {
            long blockSize = file.getBlockSize();
            long splitSize = computeSplitSize(blockSize, minSize, maxSize);
            long bytesRemaining = length;
            while (((double) bytesRemaining) / splitSize > GuaguaMapReduceConstants.SPLIT_SLOP) {
                int blkIndex = getBlockIndex(blkLocations, length - bytesRemaining);
                splits.add(new GuaguaInputSplit(false, new FileSplit(path, length - bytesRemaining, splitSize,
                        blkLocations[blkIndex].getHosts())));
                bytesRemaining -= splitSize;
            }
            if (bytesRemaining != 0) {
                splits.add(new GuaguaInputSplit(false, new FileSplit(path, length - bytesRemaining,
                        bytesRemaining, blkLocations[blkLocations.length - 1].getHosts())));
            }
        } else if (length != 0) {
            splits.add(new GuaguaInputSplit(false, new FileSplit(path, 0, length, blkLocations[0].getHosts())));
        } else {
            // Create empty hosts array for zero length files
            splits.add(new GuaguaInputSplit(false, new FileSplit(path, 0, length, new String[0])));
        }
    }
    // Save the number of input files in the job-conf
    job.getConfiguration().setLong(GuaguaMapReduceConstants.NUM_INPUT_FILES, files.size());
    LOG.debug("Total # of splits: {}", splits.size());
    return splits;
}
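This mirrors the split loop in Hadoop's own FileInputFormat: computeSplitSize resolves to Math.max(minSize, Math.min(maxSize, blockSize)), getBlockIndex finds the block containing the split's start offset so the split is tagged with that block's hosts, and the SPLIT_SLOP guard (1.1 in stock Hadoop; GuaguaMapReduceConstants.SPLIT_SLOP presumably matches) lets the final split absorb a small remainder instead of emitting a tiny trailing split.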
From source file:ml.shifu.guagua.yarn.util.InputSplitUtils.java
License:Apache License
/**
 * Generate the list of files and make them into FileSplits.
 */
public static List<InputSplit> getFileSplits(Configuration conf, long splitSize) throws IOException {
    // generate splits
    List<InputSplit> splits = new ArrayList<InputSplit>();
    FileStatus[] files = listStatus(conf);
    for (FileStatus file : files) {
        Path path = file.getPath();
        if (isPigOrHadoopMetaFile(path)) {
            continue;
        }
        FileSystem fs = path.getFileSystem(conf);
        long length = file.getLen();
        BlockLocation[] blkLocations = fs.getFileBlockLocations(file, 0, length);
        if ((length != 0) && isSplitable(conf, path)) {
            long bytesRemaining = length;
            while (((double) bytesRemaining) / splitSize > GuaguaYarnConstants.SPLIT_SLOP) {
                int blkIndex = getBlockIndex(blkLocations, length - bytesRemaining);
                splits.add(new FileSplit(path, length - bytesRemaining, splitSize,
                        blkLocations[blkIndex].getHosts()));
                bytesRemaining -= splitSize;
            }
            if (bytesRemaining != 0) {
                splits.add(new FileSplit(path, length - bytesRemaining, bytesRemaining,
                        blkLocations[blkLocations.length - 1].getHosts()));
            }
        } else if (length != 0) {
            splits.add(new FileSplit(path, 0, length, blkLocations[0].getHosts()));
        } else {
            // Create empty hosts array for zero length files
            splits.add(new FileSplit(path, 0, length, new String[0]));
        }
    }
    LOG.debug("Total # of splits: {}", splits.size());
    return splits;
}
From source file:ml.shifu.guagua.yarn.util.InputSplitUtils.java
License:Apache License
/**
 * Generate the list of files and make them into FileSplits.
 */
public static List<InputSplit> getGuaguaSplits(Configuration conf, long splitSize) throws IOException {
    // generate splits
    List<InputSplit> splits = new ArrayList<InputSplit>();
    FileStatus[] files = listStatus(conf);
    for (FileStatus file : files) {
        Path path = file.getPath();
        if (isPigOrHadoopMetaFile(path)) {
            continue;
        }
        FileSystem fs = path.getFileSystem(conf);
        long length = file.getLen();
        BlockLocation[] blkLocations = fs.getFileBlockLocations(file, 0, length);
        if ((length != 0) && isSplitable(conf, path)) {
            long bytesRemaining = length;
            while (((double) bytesRemaining) / splitSize > GuaguaYarnConstants.SPLIT_SLOP) {
                int blkIndex = getBlockIndex(blkLocations, length - bytesRemaining);
                splits.add(new GuaguaInputSplit(false, new FileSplit[] { new FileSplit(path,
                        length - bytesRemaining, splitSize, blkLocations[blkIndex].getHosts()) }));
                bytesRemaining -= splitSize;
            }
            if (bytesRemaining != 0) {
                splits.add(new GuaguaInputSplit(false, new FileSplit[] { new FileSplit(path,
                        length - bytesRemaining, bytesRemaining,
                        blkLocations[blkLocations.length - 1].getHosts()) }));
            }
        } else if (length != 0) {
            splits.add(new GuaguaInputSplit(false,
                    new FileSplit[] { new FileSplit(path, 0, length, blkLocations[0].getHosts()) }));
        } else {
            // Create empty hosts array for zero length files
            splits.add(new GuaguaInputSplit(false,
                    new FileSplit[] { new FileSplit(path, 0, length, new String[0]) }));
        }
    }
    LOG.debug("Total # of splits: {}", splits.size());
    return splits;
}
From source file:ml.shifu.shifu.core.mr.input.CombineInputFormat.java
License:Apache License
/**
 * Generate the list of files and make them into FileSplits.
 */
protected List<InputSplit> getVarSelectSplits(JobContext job) throws IOException {
    long minSize = Math.max(getFormatMinSplitSize(), getMinSplitSize(job));
    long maxSize = getMaxSplitSize(job);
    // generate splits
    List<InputSplit> splits = new ArrayList<InputSplit>();
    List<FileStatus> files = listStatus(job);
    for (FileStatus file : files) {
        Path path = file.getPath();
        if (isPigOrHadoopMetaFile(path)) {
            continue;
        }
        FileSystem fs = path.getFileSystem(job.getConfiguration());
        long length = file.getLen();
        BlockLocation[] blkLocations = fs.getFileBlockLocations(file, 0, length);
        if ((length != 0) && isSplitable(job, path)) {
            long blockSize = file.getBlockSize();
            long splitSize = computeSplitSize(blockSize, minSize, maxSize);
            long bytesRemaining = length;
            // double comparison can be used directly here because there is no precision requirement
            while (((double) bytesRemaining) / splitSize > 1.1d) {
                int blkIndex = getBlockIndex(blkLocations, length - bytesRemaining);
                splits.add(new CombineInputSplit(new FileSplit(path, length - bytesRemaining, splitSize,
                        blkLocations[blkIndex].getHosts())));
                bytesRemaining -= splitSize;
            }
            if (bytesRemaining != 0) {
                splits.add(new CombineInputSplit(new FileSplit(path, length - bytesRemaining, bytesRemaining,
                        blkLocations[blkLocations.length - 1].getHosts())));
            }
        } else if (length != 0) {
            splits.add(new CombineInputSplit(new FileSplit(path, 0, length, blkLocations[0].getHosts())));
        } else {
            // Create empty hosts array for zero length files
            splits.add(new CombineInputSplit(new FileSplit(path, 0, length, new String[0])));
        }
    }
    // Save the number of input files in the job-conf
    job.getConfiguration().setLong(GuaguaMapReduceConstants.NUM_INPUT_FILES, files.size());
    LOG.debug("Total # of splits: {}", splits.size());
    return splits;
}
From source file:ml.shifu.shifu.guagua.ShifuInputFormat.java
License:Apache License
/** Build a FileSplit for the given byte range, collecting the hosts of every block that overlaps it. */
private FileSplit getFileSplit(FileSystem fs, FileStatus file, long offset, long length) throws IOException {
    BlockLocation[] blkLocations = fs.getFileBlockLocations(file, offset, length);
    List<String> hosts = new ArrayList<String>();
    for (BlockLocation location : blkLocations) {
        hosts.addAll(Arrays.asList(location.getHosts()));
    }
    String[] shosts = new String[hosts.size()];
    FileSplit fsp = new FileSplit(file.getPath(), offset, length, hosts.toArray(shosts));
    return fsp;
}
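Unlike the split loops above, which tag each split with the hosts of a single block, this helper unions the hosts of every block returned for the range; for a split spanning many blocks the resulting host list can be long and may contain duplicates, since hosts are not de-duplicated before being copied into the array.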
From source file:mvm.rya.accumulo.mr.utils.AccumuloHDFSFileInputFormat.java
License:Apache License
@Override
public List<InputSplit> getSplits(JobContext jobContext) throws IOException {
    // read the params from AccumuloInputFormat
    Configuration conf = jobContext.getConfiguration();
    Instance instance = AccumuloProps.getInstance(jobContext);
    String user = AccumuloProps.getUsername(jobContext);
    AuthenticationToken password = AccumuloProps.getPassword(jobContext);
    String table = AccumuloProps.getTablename(jobContext);
    ArgumentChecker.notNull(instance);
    ArgumentChecker.notNull(table);
    // find the files necessary
    try {
        AccumuloConfiguration acconf = instance.getConfiguration();
        FileSystem fs = FileSystem.get(conf);
        Connector connector = instance.getConnector(user, password);
        TableOperations tos = connector.tableOperations();
        String tableId = tos.tableIdMap().get(table);
        String filePrefix = acconf.get(Property.INSTANCE_DFS_DIR) + "/tables/" + tableId;
        System.out.println(filePrefix);
        Scanner scanner = connector.createScanner("!METADATA", Constants.NO_AUTHS); // TODO: auths?
        scanner.setRange(new Range(new Text(tableId + "\u0000"), new Text(tableId + "\uFFFD")));
        scanner.fetchColumnFamily(new Text("file"));
        List<String> files = new ArrayList<String>();
        List<InputSplit> fileSplits = new ArrayList<InputSplit>();
        Job job = new Job(conf);
        for (Map.Entry<Key, Value> entry : scanner) {
            String file = filePrefix + entry.getKey().getColumnQualifier().toString();
            files.add(file);
            Path path = new Path(file);
            FileStatus fileStatus = fs.getFileStatus(path);
            long len = fileStatus.getLen();
            BlockLocation[] fileBlockLocations = fs.getFileBlockLocations(fileStatus, 0, len);
            fileSplits.add(new FileSplit(path, 0, len, fileBlockLocations[0].getHosts()));
            // FileInputFormat.addInputPath(job, path);
        }
        System.out.println(files);
        return fileSplits;
        // return super.getSplits(job);
    } catch (Exception e) {
        throw new IOException(e);
    }
}
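Note that each file found in the !METADATA scan becomes a single split covering the whole file, located only by the first block's hosts, so locality information for the remaining blocks of multi-block files is ignored.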
From source file:org.apache.accumulo.server.test.performance.scan.CollectTabletStats.java
License:Apache License
private static void reportHdfsBlockLocations(List<String> files) throws Exception {
    Configuration conf = new Configuration();
    FileSystem fs = FileSystem.get(conf);
    System.out.println("\t\tFile block report : ");
    for (String file : files) {
        FileStatus status = fs.getFileStatus(new Path(file));
        if (status.isDir()) {
            // assume it is a map file
            status = fs.getFileStatus(new Path(file + "/data"));
        }
        BlockLocation[] locs = fs.getFileBlockLocations(status, 0, status.getLen());
        System.out.println("\t\t\tBlocks for : " + file);
        for (BlockLocation blockLocation : locs) {
            System.out.printf("\t\t\t\t offset : %,13d hosts :", blockLocation.getOffset());
            for (String host : blockLocation.getHosts()) {
                System.out.print(" " + host);
            }
            System.out.println();
        }
    }
    System.out.println();
}
From source file:org.apache.accumulo.server.util.LocalityCheck.java
License:Apache License
private void addBlocks(VolumeManager fs, String host, ArrayList<String> files, Map<String, Long> totalBlocks,
        Map<String, Long> localBlocks) throws Exception {
    long allBlocks = 0;
    long matchingBlocks = 0;
    if (!totalBlocks.containsKey(host)) {
        totalBlocks.put(host, 0L);
        localBlocks.put(host, 0L);
    }
    for (String file : files) {
        Path filePath = new Path(file);
        FileSystem ns = fs.getVolumeByPath(filePath).getFileSystem();
        FileStatus fileStatus = ns.getFileStatus(filePath);
        BlockLocation[] fileBlockLocations = ns.getFileBlockLocations(fileStatus, 0, fileStatus.getLen());
        for (BlockLocation blockLocation : fileBlockLocations) {
            allBlocks++;
            for (String location : blockLocation.getHosts()) {
                HostAndPort hap = HostAndPort.fromParts(location, 0);
                if (hap.getHostText().equals(host)) {
                    matchingBlocks++;
                    break;
                }
            }
        }
    }
    totalBlocks.put(host, allBlocks + totalBlocks.get(host));
    localBlocks.put(host, matchingBlocks + localBlocks.get(host));
}
From source file:org.apache.accumulo.test.performance.scan.CollectTabletStats.java
License:Apache License
private static void reportHdfsBlockLocations(List<FileRef> files) throws Exception {
    VolumeManager fs = VolumeManagerImpl.get();
    System.out.println("\t\tFile block report : ");
    for (FileRef file : files) {
        FileStatus status = fs.getFileStatus(file.path());
        if (status.isDirectory()) {
            // assume it is a map file
            status = fs.getFileStatus(new Path(file + "/data"));
        }
        FileSystem ns = fs.getVolumeByPath(file.path()).getFileSystem();
        BlockLocation[] locs = ns.getFileBlockLocations(status, 0, status.getLen());
        System.out.println("\t\t\tBlocks for : " + file);
        for (BlockLocation blockLocation : locs) {
            System.out.printf("\t\t\t\t offset : %,13d hosts :", blockLocation.getOffset());
            for (String host : blockLocation.getHosts()) {
                System.out.print(" " + host);
            }
            System.out.println();
        }
    }
    System.out.println();
}
From source file:org.apache.asterix.external.util.HDFSUtils.java
License:Apache License
/**
 * Instead of creating the splits using the input format, we do it manually.
 * This function returns fileSplits (1 per HDFS file block) irrespective of the number of partitions,
 * and the produced splits only cover the intersection between current files in HDFS and files stored
 * internally in AsterixDB:
 * 1. NoOp means appended file
 * 2. AddOp means new file
 * 3. UpdateOp means the delta of a file
 *
 * @return
 * @throws IOException
 */
public static InputSplit[] getSplits(JobConf conf, List<ExternalFile> files) throws IOException {
    // Create file system object
    FileSystem fs = FileSystem.get(conf);
    ArrayList<FileSplit> fileSplits = new ArrayList<FileSplit>();
    ArrayList<ExternalFile> orderedExternalFiles = new ArrayList<ExternalFile>();
    // Create files splits
    for (ExternalFile file : files) {
        Path filePath = new Path(file.getFileName());
        FileStatus fileStatus;
        try {
            fileStatus = fs.getFileStatus(filePath);
        } catch (FileNotFoundException e) {
            // file was deleted at some point, skip to next file
            continue;
        }
        if (file.getPendingOp() == ExternalFilePendingOp.PENDING_ADD_OP
                && fileStatus.getModificationTime() == file.getLastModefiedTime().getTime()) {
            // Get its information from the HDFS name node
            BlockLocation[] fileBlocks = fs.getFileBlockLocations(fileStatus, 0, file.getSize());
            // Create a split per block
            for (BlockLocation block : fileBlocks) {
                if (block.getOffset() < file.getSize()) {
                    fileSplits.add(new FileSplit(filePath, block.getOffset(),
                            (block.getLength() + block.getOffset()) < file.getSize() ? block.getLength()
                                    : (file.getSize() - block.getOffset()),
                            block.getHosts()));
                    orderedExternalFiles.add(file);
                }
            }
        } else if (file.getPendingOp() == ExternalFilePendingOp.PENDING_NO_OP
                && fileStatus.getModificationTime() == file.getLastModefiedTime().getTime()) {
            long oldSize = 0L;
            long newSize = file.getSize();
            for (int i = 0; i < files.size(); i++) {
                // compare file names with equals(); reference equality (==) would miss matches
                if (files.get(i).getFileName().equals(file.getFileName())
                        && files.get(i).getSize() != file.getSize()) {
                    newSize = files.get(i).getSize();
                    oldSize = file.getSize();
                    break;
                }
            }
            // Get its information from the HDFS name node
            BlockLocation[] fileBlocks = fs.getFileBlockLocations(fileStatus, 0, newSize);
            // Create a split per block
            for (BlockLocation block : fileBlocks) {
                if (block.getOffset() + block.getLength() > oldSize) {
                    if (block.getOffset() < newSize) {
                        // Block intersects the delta -> create a split
                        long startCut = (block.getOffset() > oldSize) ? 0L : oldSize - block.getOffset();
                        long endCut = (block.getOffset() + block.getLength() < newSize) ? 0L
                                : block.getOffset() + block.getLength() - newSize;
                        long splitLength = block.getLength() - startCut - endCut;
                        fileSplits.add(new FileSplit(filePath, block.getOffset() + startCut, splitLength,
                                block.getHosts()));
                        orderedExternalFiles.add(file);
                    }
                }
            }
        }
    }
    fs.close();
    files.clear();
    files.addAll(orderedExternalFiles);
    return fileSplits.toArray(new FileSplit[fileSplits.size()]);
}
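The startCut/endCut arithmetic in the PENDING_NO_OP branch clips each block to the appended byte range (oldSize, newSize), so every produced split covers exactly the portion of the block that belongs to the file's delta.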