List of usage examples for org.apache.hadoop.fs.Path#getName()
public String getName()
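getName() returns the final component of the path (everything after the last "/"), with no scheme, authority, or parent directories; for a root path the name is the empty string. A minimal standalone sketch before the collected examples -- the class name and sample paths here are illustrative only, not taken from any of the projects below:

import org.apache.hadoop.fs.Path;

public class PathGetNameDemo {
    public static void main(String[] args) {
        Path file = new Path("hdfs://namenode:8020/user/data/part-00000.gz");
        System.out.println(file.getName());             // part-00000.gz
        System.out.println(file.getParent().getName()); // data
        System.out.println(new Path("/").getName());    // "" (a root path has an empty name)
    }
}

The examples that follow use getName() in three recurring ways: filtering directory listings by file name, branching on a file extension, and detecting a root or bucket path via an empty name.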
From source file: com.edwardsit.spark4n6.EWFFileReaderTest.java
License: Apache License

@Test
public void testGetEWFSection() throws IOException {
    log.setLevel(Level.DEBUG);
    Logger.getLogger("com.edwardsit.spark4n6")
            .addAppender(new RollingFileAppender(new PatternLayout(), "debug.log"));
    Configuration conf = new Configuration(false);
    Path path = new Path("../macwd.E01");
    // Path path = new Path("D:\\Users\\Derek\\Images\\500GB\\500GB-CDrive.E01");
    FileSystem fs = path.getFileSystem(conf);
    EWFFileReader reader = new EWFFileReader(fs, path);
    long size = reader.getImageSize();
    ArrayList<EWFSection.SectionPrefix> sections = reader.getSectionPrefixArray();
    Iterator<EWFSection.SectionPrefix> it = sections.iterator();
    EWFSection.SectionPrefix sp;
    long numSplits = 10L;
    long priorStart = 0L;
    long priorEnd = 0L;
    Path priorFile = null;
    log.debug(path.getName() + ": imageSize = " + size);
    log.debug("File\t\tChunkIndex\t\tSectionType\t\tChunkCount\t\tSectionSize");
    while (it.hasNext()) {
        sp = it.next();
        assertNotNull(sp);
        log.debug(sp.file + "\t\t" + sp.chunkIndex + "\t\t" + sp.sectionType + "\t\t" + sp.chunkCount
                + "\t\t" + sp.sectionSize);
        if (!sp.file.equals(priorFile) && sp.sectionType.equals(EWFSection.SectionType.TABLE_TYPE)) {
            if (priorFile != null) {
                priorEnd = sp.chunkIndex;
                // log.debug(priorFile + "Split#" + (numSplits * priorEnd * 64 * 512 / size) + ", " + priorStart + " to " + priorEnd);
            }
            priorFile = sp.file;
            priorStart = sp.chunkIndex;
        }
    }
    // log.debug(priorFile + " Split#" + (numSplits * priorEnd * 64 * 512 / size) + ", " + priorStart + " to " + size / 64 / 512);
}
From source file: com.edwardsit.spark4n6.EWFImageInputFormat.java
License: Apache License

@Override
public List<InputSplit> getSplits(JobContext job) throws IOException {
    log.setLevel(Level.DEBUG);
    List<InputSplit> splits = new ArrayList<InputSplit>();
    List<FileStatus> files = listStatus(job);
    BlockLocation[] blkLocations = null;
    Path path = null;
    FileSystem fs = null;
    EWFFileReader ewf = null;
    ArrayList<EWFSection.SectionPrefix> sections = null;
    Iterator<EWFSection.SectionPrefix> it = null;
    EWFSection.SectionPrefix sp = null;
    Path priorFile = null;
    long priorOffset = 0L;
    FileStatus priorFileStatus = null;
    chunkSize = new EWFSegmentFileReader(fs).DEFAULT_CHUNK_SIZE;
    long priorStart = 0L;
    int blkIndex = 0;
    for (FileStatus file : files) {
        path = file.getPath();
        fs = path.getFileSystem(job.getConfiguration());
        if (path.getName().endsWith(".E01")) {
            ewf = new EWFFileReader(fs, path);
            sections = ewf.getSectionPrefixArray();
            it = sections.iterator();
            while (it.hasNext()) {
                sp = it.next();
                if (sp.sectionType.equals(EWFSection.SectionType.TABLE_TYPE)) {
                    priorFileStatus = fs.getFileStatus(priorFile);
                    for (long i = sp.chunkCount; i > 0L; i = i - getChunksPerSplit(priorFileStatus)) {
                        if (priorFileStatus instanceof LocatedFileStatus) {
                            blkLocations = ((LocatedFileStatus) priorFileStatus).getBlockLocations();
                        } else {
                            blkLocations = fs.getFileBlockLocations(priorFileStatus, priorOffset,
                                    (getChunksPerSplit(priorFileStatus) * chunkSize));
                        }
                        blkIndex = getBlockIndex(blkLocations, priorOffset);
                        if (i > getChunksPerSplit(priorFileStatus)) {
                            log.debug("splits.add(makeSplit(" + priorFile + ", " + (priorStart * chunkSize) + ", "
                                    + (getChunksPerSplit(priorFileStatus) * chunkSize) + ", "
                                    + listHosts(blkLocations, blkIndex) + ");");
                            splits.add(makeSplit(priorFile, (priorStart * chunkSize),
                                    (getChunksPerSplit(priorFileStatus) * chunkSize),
                                    blkLocations[blkIndex].getHosts()));
                            priorStart += getChunksPerSplit(priorFileStatus);
                        } else {
                            log.debug("splits.add(makeSplit(" + priorFile + ", " + (priorStart * chunkSize) + ", "
                                    + (i * chunkSize) + ", " + listHosts(blkLocations, blkIndex) + ");");
                            splits.add(makeSplit(priorFile, (priorStart * chunkSize), (i * chunkSize),
                                    blkLocations[blkIndex].getHosts()));
                            priorStart += i;
                        }
                    }
                }
                priorFile = sp.file;
                priorOffset = sp.fileOffset;
            }
        }
    }
    return splits;
}
From source file: com.elex.dmp.vectorizer.FixDictionaryVectorizer.java
License: Apache License

/**
 * Read the feature frequency List which is built at the end of the Word Count Job and assign ids to them.
 * This will use constant memory and will run at the speed of your disk read.
 */
private static List<Path> createDictionaryChunks(Path dictPath, Path dictionaryPathBase, Configuration baseConf,
        int chunkSizeInMegabytes, int[] maxTermDimension) throws IOException {
    List<Path> chunkPaths = Lists.newArrayList();
    Configuration conf = new Configuration(baseConf);
    FileSystem fs = FileSystem.get(dictPath.toUri(), conf);
    FileStatus[] dictFiles = fs.listStatus(dictPath, new PathFilter() {
        @Override
        public boolean accept(Path path) {
            String name = path.getName();
            return name.startsWith("dictionary.") && !name.endsWith(".crc");
        }
    });
    for (int i = 0; i < dictFiles.length; i++) {
        chunkPaths.add(dictFiles[i].getPath());
    }
    return chunkPaths;
}
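The pattern above -- a PathFilter that inspects only the final path component -- also appears in the TaskManager and FileHiveMetastore examples below. A small reusable sketch of it; the class name and prefix are illustrative and not part of any of these projects:

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.PathFilter;

/** Accepts paths whose final component starts with the given prefix and is not a CRC side file. */
public class PrefixPathFilter implements PathFilter {
    private final String prefix;

    public PrefixPathFilter(String prefix) {
        this.prefix = prefix;
    }

    @Override
    public boolean accept(Path path) {
        String name = path.getName(); // only the last component is checked, never the parent directories
        return name.startsWith(prefix) && !name.endsWith(".crc");
    }
}

Used as fs.listStatus(dictPath, new PrefixPathFilter("dictionary.")), this would return only the dictionary chunk files and skip Hadoop's .crc side files, matching the inline anonymous filter above.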
From source file: com.ery.dimport.daemon.TaskManager.java
License: Apache License

public void runTask(final TaskInfo task) {
    List<LogHostRunInfoPO> allFiles = new ArrayList<LogHostRunInfoPO>();
    try {
        task.START_TIME = new Date(System.currentTimeMillis());
        boolean needUpdate = false;
        TaskInfo exists = allTask.get(task.TASK_ID);
        if (exists == null) {
            needUpdate = true;
        } else {
            task.hosts = exists.hosts;
        }
        if (task.hosts == null || task.hosts.size() == 0) {
            task.hosts = new ArrayList<String>(master.getServerManager().getOnlineServers().keySet());
            needUpdate = true;
        }
        if (ZKUtil.checkExists(watcher, watcher.dimportRunTaskNode + "/" + task.TASK_ID) == -1) {
            needUpdate = true;
        }
        if (needUpdate) {
            try {
                task.HOST_SIZE = task.hosts.size();
                master.logWriter.writeLog(task);
                ZKUtil.createSetData(watcher, watcher.dimportRunTaskNode + "/" + task.TASK_ID,
                        DImportConstant.Serialize(task));
            } catch (Throwable e) {
            }
        }
        Thread thread = Thread.currentThread();
        ProcessInfo procInfo = null;
        synchronized (taskInProgress) {
            procInfo = taskInProgress.get(task.getRunTaskId());
        }
        procInfo.thread = thread;
        procInfo.startTime = System.currentTimeMillis();
        String filePath = task.FILE_PATH;
        boolean isInHdfs = false;
        final Map<String, Long> files = new HashMap<String, Long>();
        String tmpPath = conf.get(DImportConstant.DIMPORT_PROCESS_TMPDATA_DIR, System.getProperty("user.home"));
        if (tmpPath.endsWith("/")) {
            tmpPath = tmpPath.substring(0, tmpPath.length() - 1);
        }
        if (filePath == null || filePath.equals("")) {
            files.put("", 0L);
        } else {
            if (task.fileNamePattern != null || (task.FILE_FILTER != null && !task.FILE_FILTER.equals(""))) {
                task.FILE_FILTER = DImportConstant.macroProcess(task.FILE_FILTER);
                task.FILE_FILTER = task.FILE_FILTER.replaceAll("\\{host\\}", this.master.hostName);
                task.fileNamePattern = Pattern.compile(task.FILE_FILTER);
            }
            Matcher m = hdfsUrlPattern.matcher(filePath);
            if (m.matches()) {
                isInHdfs = true;
                filePath = m.group(2);
                // for (String string : conf.getValByRegex(".*").keySet()) {
                //     System.out.println(string + "=" + conf.get(string));
                // }
                Path dirPath = new Path(filePath);
                FileSystem fs = FileSystem.get(HadoopConf.getConf(conf));
                if (!fs.exists(dirPath) || !fs.isDirectory(dirPath)) {
                    throw new IOException("HDFS? " + filePath + "?,?");
                }
                FileStatus[] hFiles = fs.listStatus(dirPath, new PathFilter() {
                    @Override
                    public boolean accept(Path name) {
                        if (task.fileNamePattern != null) {
                            System.out.println("hdfs listStatus:" + name.getParent() + "/" + name.getName());
                            return task.fileNamePattern.matcher(name.getName()).matches();
                        } else {
                            return true;
                        }
                    }
                });
                for (int i = 0; i < hFiles.length; i++) {
                    files.put(hFiles[i].getPath().toString(), hFiles[i].getLen());
                }
            } else {
                java.io.File f = new File(filePath);
                if (!f.exists() || !f.isDirectory()) {
                    throw new IOException("? " + filePath + "? ,?");
                }
                File[] lFiles = f.listFiles(new FilenameFilter() {
                    public boolean accept(File dir, String name) {
                        if (task.fileNamePattern != null) {
                            System.out.println("local fs listStatus:" + dir + "/" + name);
                            return task.fileNamePattern.matcher(name).matches();
                        } else {
                            return true;
                        }
                    }
                });
                for (int i = 0; i < lFiles.length; i++) {
                    files.put(lFiles[i].getAbsolutePath(), lFiles[i].length());
                }
            }
        }
        for (String fileName : files.keySet()) {
            LogHostRunInfoPO runInfo = new LogHostRunInfoPO(task);
            runInfo.RUN_LOG_ID = DImportConstant.shdf.format(task.SUBMIT_TIME) + "_" + allFiles.size() + "_"
                    + fileName.hashCode();
            runInfo.FILE_NAME = fileName;
            runInfo.RETURN_CODE = 255;
            runInfo.IS_RUN_SUCCESS = -1;
            runInfo.FILE_SIZE = files.get(fileName);
            runInfo.HOST_NAME = master.hostName;
            String localFile = fileName;
            if (isInHdfs) {
                localFile = tmpPath + "/" + fileName.substring(fileName.lastIndexOf("/") + 1);
            }
            String[] cmds = procInfo.task.getCommand();
            for (int j = 0; j < cmds.length; j++) {
                cmds[j] = DImportConstant.macroProcess(cmds[j]);
                cmds[j] = cmds[j].replaceAll("\\{file\\}", localFile);
                cmds[j] = cmds[j].replaceAll("\\{host\\}", master.hostName);
            }
            runInfo.RUN_COMMAND = StringUtils.join(" ", cmds);
            master.logWriter.writeLog(runInfo);
            LOG.info("??" + runInfo);
            allFiles.add(runInfo);
        }
        ZKUtil.createSetData(watcher, watcher.dimportRunTaskNode + "/" + task.TASK_ID + "/" + master.hostName,
                DImportConstant.Serialize(allFiles));
        for (LogHostRunInfoPO runInfo : allFiles) {
            if (procInfo.stoped)
                break;
            String fileName = runInfo.FILE_NAME;
            LOG.info("?:" + fileName);
            procInfo.RUN_LOG_ID = runInfo.RUN_LOG_ID;
            runInfo.START_TIME = new Date(System.currentTimeMillis());
            procInfo.processFile = fileName;
            String localFile = fileName;
            try {
                if (isInHdfs) {
                    localFile = tmpPath + "/" + fileName.substring(fileName.lastIndexOf("/") + 1);
                }
                procInfo.task.TASK_COMMAND = runInfo.RUN_COMMAND;
                if (isInHdfs) {
                    File lf = new File(localFile);
                    if (lf.exists())
                        lf.delete();
                    FileSystem fs = FileSystem.get(HadoopConf.getConf(conf));
                    LOG.info("HDFS:" + fileName + "===>" + localFile);
                    long btime = System.currentTimeMillis();
                    fs.copyToLocalFile(new Path(fileName), new Path(localFile));
                    LOG.info("HDFS?:" + fileName + "===>" + localFile);
                    runInfo.downTime = System.currentTimeMillis() - btime;
                    fileName = localFile;
                }
                updateHostInfoLog(runInfo, allFiles);
                LOG.info(procInfo.task.TASK_NAME + " commandline: " + procInfo.task.TASK_COMMAND);
                procInfo.proc = execResult(runInfo.RUN_COMMAND);
                runInfo.IS_RUN_SUCCESS = 1;
                runInfo.RETURN_CODE = writeProcessLog(procInfo);
                LOG.info(procInfo.task.TASK_NAME + " return value: " + runInfo.RETURN_CODE);
                // runInfo.RETURN_CODE = procInfo.proc.exitValue();
            } catch (Throwable e) {
                runInfo.ERROR_MSG = e.getMessage();
                if (procInfo.proc != null) {
                    try {
                        procInfo.proc.destroy();
                    } catch (Exception ex) {
                    }
                }
                procInfo.proc = null;
                LOG.error("", e);
            } finally {
                runInfo.END_TIME = new Date(System.currentTimeMillis());
                master.logWriter.updateLog(runInfo);
                updateHostInfoLog(runInfo, allFiles);
                ZKUtil.createSetData(watcher,
                        watcher.dimportRunTaskNode + "/" + task.TASK_ID + "/" + master.hostName,
                        DImportConstant.Serialize(allFiles));
                if (isInHdfs) {
                    File lf = new File(localFile);
                    if (lf.exists())
                        lf.delete();
                }
            }
        }
    } catch (Throwable e) {
        LOG.error("" + task, e);
        try {
            if (allFiles.size() > 0) {
                for (LogHostRunInfoPO logHostRunInfoPO : allFiles) {
                    if (logHostRunInfoPO.END_TIME.getTime() < 10000) {
                        logHostRunInfoPO.END_TIME = new Date(System.currentTimeMillis());
                        logHostRunInfoPO.IS_RUN_SUCCESS = 1;
                        logHostRunInfoPO.RETURN_CODE = 2;
                    }
                }
                ZKUtil.createSetData(watcher,
                        watcher.dimportRunTaskNode + "/" + task.TASK_ID + "/" + master.hostName,
                        DImportConstant.Serialize(allFiles));
            }
        } catch (KeeperException e1) {
            LOG.error("update task run info on host :" + watcher.dimportRunTaskNode + "/" + task.TASK_ID + "/"
                    + master.hostName, e);
        } catch (IOException e1) {
            LOG.error("update task run info on host " + watcher.dimportRunTaskNode + "/" + task.TASK_ID + "/"
                    + master.hostName, e);
        }
    } finally {
        synchronized (taskInProgress) {
            taskInProgress.remove(task.getRunTaskId());
        }
    }
}
From source file: com.ery.hadoop.mrddx.file.LineRecordReader.java
License: Apache License

void openFile() throws IOException {
    start = split.getStart();
    end = start + split.getLength();
    final Path file = split.getPath();
    LOG.info("split.getFileIndex=" + split.getFileIndex() + ",file.path=" + file.toString() + " fileEncodeing="
            + fileEncodeing + " " + split.getStart() + ":" + split.getLength());
    // open the file and seek to the start of the split
    FileSystem fs = file.getFileSystem(job);
    FSDataInputStream fileIn = fs.open(split.getPath());
    compressionCodecs = new CompressionCodecFactory(job);
    codec = compressionCodecs.getCodec(file);
    if (file.getName().endsWith(".zip")) {
        LOG.info("use ZipInputStream read file " + split.getPath());
        ZipInputStream zin = new ZipInputStream(fileIn, Charset.forName(fileEncodeing));
        in = new LineReader(zin, job);
        filePosition = fileIn;
        codec = new GzipCodec();
        return;
    }
    if (isCompressedInput()) {
        decompressor = CodecPool.getDecompressor(codec);
        if (codec instanceof SplittableCompressionCodec) {
            final SplitCompressionInputStream cIn = ((SplittableCompressionCodec) codec).createInputStream(fileIn,
                    decompressor, start, end, SplittableCompressionCodec.READ_MODE.BYBLOCK);
            // tar.gz -> TarInputStream
            // new TarInputStream(codec.createInputStream(fileIn, decompressor))
            String filename = file.getName();
            if (filename.endsWith(".tar.gz")) {
                in = new LineReader(new TarInputStream(cIn), job);
            } else {
                in = new LineReader(cIn, job);
            }
            start = cIn.getAdjustedStart();
            end = cIn.getAdjustedEnd();
            filePosition = cIn; // take pos from compressed stream
        } else {
            String filename = file.getName();
            if (filename.endsWith(".tar.gz") || filename.endsWith(".tar")) {
                in = new LineReader(new TarInputStream(codec.createInputStream(fileIn, decompressor)), job);
            } else {
                in = new LineReader(codec.createInputStream(fileIn, decompressor), job);
            }
            filePosition = fileIn;
        }
    } else {
        fileIn.seek(start);
        String filename = file.getName();
        if (filename.endsWith(".tar")) {
            in = new LineReader(new TarInputStream(fileIn), job);
        } else {
            in = new LineReader(fileIn, job);
        }
        filePosition = fileIn;
    }
    // If this is not the first split, we always throw away first record
    // because we always (except the last split) read one extra line in
    // next() method.
    if (start != 0) {
        start += in.readLine(new Text(), 0, maxBytesToConsume(start));
    }
    this.pos = start;
}
From source file: com.ery.hadoop.mrddx.hFile.LineRecordReader.java
License: Apache License

public void initialize(InputSplit genericSplit, TaskAttemptContext context) throws IOException {
    FileSplit split = (FileSplit) genericSplit;
    Configuration job = context.getConfiguration();
    this.maxLineLength = job.getInt("mapred.linerecordreader.maxlength", Integer.MAX_VALUE);
    start = split.getStart();
    end = start + split.getLength();
    final Path file = split.getPath();
    compressionCodecs = new CompressionCodecFactory(job);
    codec = compressionCodecs.getCodec(file);
    // open the file and seek to the start of the split
    FileSystem fs = file.getFileSystem(job);
    FSDataInputStream fileIn = fs.open(split.getPath());
    if (isCompressedInput()) {
        decompressor = CodecPool.getDecompressor(codec);
        if (codec instanceof SplittableCompressionCodec) {
            final SplitCompressionInputStream cIn = ((SplittableCompressionCodec) codec).createInputStream(fileIn,
                    decompressor, start, end, SplittableCompressionCodec.READ_MODE.BYBLOCK);
            // tar.gz -> TarInputStream
            // new TarInputStream(codec.createInputStream(fileIn, decompressor))
            String filename = file.getName();
            if (filename.endsWith(".tar.gz")) {
                in = new LineReader(new TarInputStream(cIn), job);
            } else {
                in = new LineReader(cIn, job);
            }
            start = cIn.getAdjustedStart();
            end = cIn.getAdjustedEnd();
            filePosition = cIn;
        } else {
            String filename = file.getName();
            if (filename.endsWith(".tar.gz")) {
                in = new LineReader(new TarInputStream(codec.createInputStream(fileIn, decompressor)), job);
            } else {
                in = new LineReader(codec.createInputStream(fileIn, decompressor), job);
            }
            filePosition = fileIn;
        }
    } else {
        fileIn.seek(start);
        in = new LineReader(fileIn, job);
        filePosition = fileIn;
    }
    // If this is not the first split, we always throw away first record
    // because we always (except the last split) read one extra line in
    // next() method.
    if (start != 0) {
        start += in.readLine(new Text(), 0, maxBytesToConsume(start));
    }
    this.pos = start;
}
From source file: com.facebook.presto.hive.metastore.file.FileHiveMetastore.java
License: Apache License

private List<Path> getChildSchemaDirectories(Path metadataDirectory) {
    try {
        if (!metadataFileSystem.isDirectory(metadataDirectory)) {
            return ImmutableList.of();
        }
        ImmutableList.Builder<Path> childSchemaDirectories = ImmutableList.builder();
        for (FileStatus child : metadataFileSystem.listStatus(metadataDirectory)) {
            if (!child.isDirectory()) {
                continue;
            }
            Path childPath = child.getPath();
            if (childPath.getName().startsWith(".")) {
                continue;
            }
            if (metadataFileSystem.isFile(new Path(childPath, PRESTO_SCHEMA_FILE_NAME))) {
                childSchemaDirectories.add(childPath);
            }
        }
        return childSchemaDirectories.build();
    } catch (IOException e) {
        throw new PrestoException(HIVE_METASTORE_ERROR, e);
    }
}
From source file: com.facebook.presto.hive.metastore.SemiTransactionalHiveMetastore.java
License: Apache License

private static RecursiveDeleteResult doRecursiveDeleteFiles(FileSystem fileSystem, Path directory,
        List<String> filePrefixes, boolean deleteEmptyDirectories) {
    FileStatus[] allFiles;
    try {
        allFiles = fileSystem.listStatus(directory);
    } catch (IOException e) {
        ImmutableList.Builder<String> notDeletedItems = ImmutableList.builder();
        notDeletedItems.add(directory.toString() + "/**");
        return new RecursiveDeleteResult(false, notDeletedItems.build());
    }

    boolean allDescendentsDeleted = true;
    ImmutableList.Builder<String> notDeletedEligibleItems = ImmutableList.builder();
    for (FileStatus fileStatus : allFiles) {
        if (HadoopFileStatus.isFile(fileStatus)) {
            Path filePath = fileStatus.getPath();
            String fileName = filePath.getName();
            boolean eligible = false;
            for (String filePrefix : filePrefixes) {
                if (fileName.startsWith(filePrefix)) {
                    eligible = true;
                    break;
                }
            }
            if (eligible) {
                if (!deleteIfExists(fileSystem, filePath, false)) {
                    allDescendentsDeleted = false;
                    notDeletedEligibleItems.add(filePath.toString());
                }
            } else {
                allDescendentsDeleted = false;
            }
        } else if (HadoopFileStatus.isDirectory(fileStatus)) {
            RecursiveDeleteResult subResult = doRecursiveDeleteFiles(fileSystem, fileStatus.getPath(),
                    filePrefixes, deleteEmptyDirectories);
            if (!subResult.isDirectoryNoLongerExists()) {
                allDescendentsDeleted = false;
            }
            if (!subResult.getNotDeletedEligibleItems().isEmpty()) {
                notDeletedEligibleItems.addAll(subResult.getNotDeletedEligibleItems());
            }
        } else {
            allDescendentsDeleted = false;
            notDeletedEligibleItems.add(fileStatus.getPath().toString());
        }
    }
    if (allDescendentsDeleted && deleteEmptyDirectories) {
        verify(notDeletedEligibleItems.build().isEmpty());
        if (!deleteIfExists(fileSystem, directory, false)) {
            return new RecursiveDeleteResult(false, ImmutableList.of(directory.toString() + "/"));
        }
        return new RecursiveDeleteResult(true, ImmutableList.of());
    }
    return new RecursiveDeleteResult(false, notDeletedEligibleItems.build());
}
From source file: com.facebook.presto.hive.PrestoS3FileSystem.java
License: Apache License

@Override
public FileStatus getFileStatus(Path path) throws IOException {
    if (path.getName().isEmpty()) {
        // the bucket root requires special handling
        if (getS3ObjectMetadata(path) != null) {
            return new FileStatus(0, true, 1, 0, 0, qualifiedPath(path));
        }
        throw new FileNotFoundException("File does not exist: " + path);
    }

    ObjectMetadata metadata = getS3ObjectMetadata(path);

    if (metadata == null) {
        // check if this path is a directory
        Iterator<LocatedFileStatus> iterator = listPrefix(path);
        if ((iterator != null) && iterator.hasNext()) {
            return new FileStatus(0, true, 1, 0, 0, qualifiedPath(path));
        }
        throw new FileNotFoundException("File does not exist: " + path);
    }

    return new FileStatus(metadata.getContentLength(), false, 1, BLOCK_SIZE.toBytes(),
            lastModifiedTime(metadata), qualifiedPath(path));
}
From source file: com.facebook.presto.hive.s3.PrestoS3FileSystem.java
License: Apache License

@Override
public FileStatus getFileStatus(Path path) throws IOException {
    if (path.getName().isEmpty()) {
        // the bucket root requires special handling
        if (getS3ObjectMetadata(path) != null) {
            return new FileStatus(0, true, 1, 0, 0, qualifiedPath(path));
        }
        throw new FileNotFoundException("File does not exist: " + path);
    }

    ObjectMetadata metadata = getS3ObjectMetadata(path);

    if (metadata == null) {
        // check if this path is a directory
        Iterator<LocatedFileStatus> iterator = listPrefix(path);
        if (iterator.hasNext()) {
            return new FileStatus(0, true, 1, 0, 0, qualifiedPath(path));
        }
        throw new FileNotFoundException("File does not exist: " + path);
    }

    return new FileStatus(getObjectSize(path, metadata), false, 1, BLOCK_SIZE.toBytes(),
            lastModifiedTime(metadata), qualifiedPath(path));
}