List of usage examples for org.apache.hadoop.fs.Path#getName()
public String getName()
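getName() returns the final component of the path (everything after the last '/'), with no scheme, authority, or parent directories. A minimal standalone sketch (the class name and sample paths below are illustrative only):

import org.apache.hadoop.fs.Path;

public class PathGetNameDemo {
    public static void main(String[] args) {
        // getName() returns only the last path component, regardless of
        // scheme, authority, or parent directories.
        Path p = new Path("hdfs://namenode:8020/user/alice/job_conf.xml");
        System.out.println(p.getName()); // prints: job_conf.xml

        Path relative = new Path("logs/app.log");
        System.out.println(relative.getName()); // prints: app.log
    }
}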
From source file:com.twitter.hraven.etl.JobFileModifiedRangeSubstringPathFilter.java
License:Apache License
@Override
public boolean accept(Path path) {
    if (!super.accept(path)) {
        return false;
    }
    JobFile jobFile = new JobFile(path.getName());
    if (jobFile.isJobConfFile() || jobFile.isJobHistoryFile()) {
        if (jobFile.isJobHistoryFile()) {
            if (!includesPathSubstrings(path) || !excludesPathSubstrings(path)) {
                return false;
            }
        }
        try {
            FileSystem fs = path.getFileSystem(myConf);
            FileStatus fileStatus = fs.getFileStatus(path);
            long fileModificationTimeMillis = fileStatus.getModificationTime();
            return accept(fileModificationTimeMillis);
        } catch (IOException e) {
            throw new ImportException("Cannot determine file modification time of " + path.getName(), e);
        }
    } else {
        // Reject anything that does not match a job conf filename.
        LOG.info(" Not a valid job conf / job history file " + path.getName());
        return false;
    }
}
From source file:com.twitter.hraven.etl.JobFilePartitioner.java
License:Apache License
/**
 * @param hdfs
 *          FileSystem handle
 * @param outputPath
 *          base directory where files are to be written
 * @param fileModTime
 *          modification time of the file that needs to be moved/copied to HDFS
 * @return the existing path in HDFS to write the file to. Will be created if
 *         it does not exist.
 * @throws IOException
 *           if the year/month/day directory cannot be created in outputPath.
 */
private Path getTargetDirectory(FileSystem hdfs, Path outputPath, long fileModTime) throws IOException {
    String year = YEAR_FORMAT.format(new Date(fileModTime));
    String month = MONTH_FORMAT.format(new Date(fileModTime));
    String day = DAY_FORMAT.format(new Date(fileModTime));

    Path yearDir = new Path(outputPath, year);
    Path monthDir = new Path(yearDir, month);
    Path dayDir = new Path(monthDir, day);

    // Create the year/month/day directory if it does not already exist.
    if (!hdfs.exists(dayDir)) {
        if (hdfs.mkdirs(dayDir)) {
            LOG.info("Created: " + dayDir.toString());
        } else {
            throw new IOException("Unable to create target directory with date: " + dayDir.getName());
        }
    }
    return dayDir;
}
From source file:com.twitter.hraven.etl.JobFilePartitioner.java
License:Apache License
/**
 * @param hdfs
 *          FileSystem handle
 * @param f
 *          file to copy to HDFS
 * @param outputPath
 *          base directory in HDFS to copy the file into
 * @param skipExisting
 *          skip if the file already exists in the target. The file will be
 *          overwritten if it is already there and this argument is false.
 * @throws IOException
 *           if the target directory cannot be created or the file cannot be
 *           copied to the target directory.
 */
private void processPlainFile(FileSystem hdfs, File f, Path outputPath, boolean skipExisting) throws IOException {
    long fileModTime = f.lastModified();
    Path targetDir = getTargetDirectory(hdfs, outputPath, fileModTime);

    boolean doCopy = true;
    Path sourceFile = new Path(f.getPath());
    if (skipExisting) {
        Path target = new Path(targetDir, sourceFile.getName());
        if (hdfs.exists(target)) {
            doCopy = false;
        }
    }
    if (doCopy) {
        hdfs.copyFromLocalFile(sourceFile, targetDir);
    }
}
From source file:com.twitter.hraven.etl.JobFilePathFilter.java
License:Apache License
@Override
public boolean accept(Path path) {
    // Ideally we want to do this:
    // JobFile jobFile = new JobFile(path.getName());
    // return (jobFile.isJobConfFile() || jobFile.isJobHistoryFile());
    // Aside from that not being efficient, it also chokes on input directories.
    // Therefore, allow anything but CRC files. The record reader will have to
    // deal with the rest.
    return !((path == null) || (path.getName().endsWith(".crc")));
}
From source file:com.twitter.hraven.etl.ProcessRecordService.java
License:Apache License
/**
 * @param initialProcessFile
 *          The path to the file to be moved.
 * @param outputPath
 *          The path where this file is to be moved to.
 * @return the new path of the moved file.
 * @throws IOException
 *           when bad things happen.
 * @throws ProcessingException
 *           when the file cannot be moved.
 */
public Path moveProcessFile(Path initialProcessFile, Path outputPath) throws IOException {
    String processFileName = initialProcessFile.getName();
    Path processFile = new Path(outputPath, processFileName);
    boolean success = fs.rename(initialProcessFile, processFile);
    if (!success) {
        throw new ProcessingException(
                "Unable to move processing file " + initialProcessFile + " to " + processFile);
    }
    return processFile;
}
From source file:com.twitter.hraven.etl.TestFileLister.java
License:Apache License
@Test
public void testPruneFileListBySize() throws IOException {
    long maxFileSize = 20L;
    FileStatus[] origList = new FileStatus[2];
    FileSystem hdfs = FileSystem.get(UTIL.getConfiguration());
    Path inputPath = new Path("/inputdir_filesize");
    boolean os = hdfs.mkdirs(inputPath);
    assertTrue(os);
    assertTrue(hdfs.exists(inputPath));

    final String JOB_HISTORY_FILE_NAME = "src/test/resources/job_1329348432655_0001-1329348443227-user-Sleep+job-1329348468601-10-1-SUCCEEDED-default.jhist";
    File jobHistoryfile = new File(JOB_HISTORY_FILE_NAME);
    Path srcPath = new Path(jobHistoryfile.toURI());
    hdfs.copyFromLocalFile(srcPath, inputPath);
    Path expPath = new Path(inputPath.toUri() + "/" + srcPath.getName());
    assertTrue(hdfs.exists(expPath));
    origList[0] = hdfs.getFileStatus(expPath);

    final String JOB_CONF_FILE_NAME = "src/test/resources/job_1329348432655_0001_conf.xml";
    File jobConfFile = new File(JOB_CONF_FILE_NAME);
    srcPath = new Path(jobConfFile.toURI());
    hdfs.copyFromLocalFile(srcPath, inputPath);
    expPath = new Path(inputPath.toUri() + "/" + srcPath.getName());
    assertTrue(hdfs.exists(expPath));
    origList[1] = hdfs.getFileStatus(expPath);

    FileStatus[] prunedList = FileLister.pruneFileListBySize(maxFileSize, origList, hdfs, inputPath);
    assertNotNull(prunedList);
    assertTrue(prunedList.length == 0);

    Path emptyFile = new Path(inputPath.toUri() + "/" + "job_1329341111111_0101-1329111113227-user2-Sleep.jhist");
    os = hdfs.createNewFile(emptyFile);
    assertTrue(os);
    assertTrue(hdfs.exists(emptyFile));
    origList[0] = hdfs.getFileStatus(emptyFile);

    Path emptyConfFile = new Path(inputPath.toUri() + "/" + "job_1329341111111_0101_conf.xml");
    os = hdfs.createNewFile(emptyConfFile);
    assertTrue(os);
    assertTrue(hdfs.exists(emptyConfFile));
    origList[1] = hdfs.getFileStatus(emptyConfFile);

    prunedList = FileLister.pruneFileListBySize(maxFileSize, origList, hdfs, inputPath);
    assertNotNull(prunedList);
    assertTrue(prunedList.length == 2);
}
From source file:com.twitter.hraven.etl.TestFileLister.java
License:Apache License
/**
 * Removes a conf file which has already been put in the prunedList.
 *
 * @throws IOException
 */
@Test
public void testPruneFileListRemovingConfFromPruneList() throws IOException {
    long maxFileSize = 20L;
    FileStatus[] origList = new FileStatus[2];
    FileSystem hdfs = FileSystem.get(UTIL.getConfiguration());
    Path inputPath = new Path("/inputdir_filesize_pruneList");
    boolean os = hdfs.mkdirs(inputPath);
    assertTrue(os);
    assertTrue(hdfs.exists(inputPath));

    Path relocationPath = new Path("/relocation_filesize_pruneList");
    os = hdfs.mkdirs(relocationPath);
    assertTrue(os);
    assertTrue(hdfs.exists(relocationPath));

    Path emptyConfFile = new Path(inputPath.toUri() + "/" + "job_1329348432655_0001_conf.xml");
    os = hdfs.createNewFile(emptyConfFile);
    assertTrue(os);
    assertTrue(hdfs.exists(emptyConfFile));
    origList[0] = hdfs.getFileStatus(emptyConfFile);

    final String JOB_HISTORY_FILE_NAME = "src/test/resources/job_1329348432655_0001-1329348443227-user-Sleep+job-1329348468601-10-1-SUCCEEDED-default.jhist";
    File jobHistoryfile = new File(JOB_HISTORY_FILE_NAME);
    Path srcPath = new Path(jobHistoryfile.toURI());
    hdfs.copyFromLocalFile(srcPath, inputPath);
    Path expPath = new Path(inputPath.toUri() + "/" + srcPath.getName());
    assertTrue(hdfs.exists(expPath));
    origList[1] = hdfs.getFileStatus(expPath);

    FileStatus[] prunedList = FileLister.pruneFileListBySize(maxFileSize, origList, hdfs, inputPath);
    assertNotNull(prunedList);
    assertTrue(prunedList.length == 0);
}
From source file:com.twitter.hraven.etl.TestFileLister.java
License:Apache License
/**
 * Tests the case when several files are spread out in the dir and need to be removed.
 *
 * @throws IOException
 */
@Test
public void testPruneFileListMultipleFilesAlreadyMovedCases() throws IOException {
    long maxFileSize = 20L;
    FileStatus[] origList = new FileStatus[12];
    FileSystem hdfs = FileSystem.get(UTIL.getConfiguration());
    Path inputPath = new Path("/inputdir_filesize_multiple");
    boolean os = hdfs.mkdirs(inputPath);
    assertTrue(os);
    assertTrue(hdfs.exists(inputPath));

    Path relocationPath = new Path("/relocation_filesize_multiple");
    os = hdfs.mkdirs(relocationPath);
    assertTrue(os);
    assertTrue(hdfs.exists(relocationPath));

    Path emptyFile = new Path(inputPath.toUri() + "/" + "job_1329341111111_0101-1329111113227-user2-Sleep.jhist");
    os = hdfs.createNewFile(emptyFile);
    assertTrue(os);
    assertTrue(hdfs.exists(emptyFile));
    origList[0] = hdfs.getFileStatus(emptyFile);

    Path emptyConfFile = new Path(inputPath.toUri() + "/" + "job_1329341111111_0101_conf.xml");
    os = hdfs.createNewFile(emptyConfFile);
    assertTrue(os);
    assertTrue(hdfs.exists(emptyConfFile));
    origList[1] = hdfs.getFileStatus(emptyConfFile);

    final String JOB_HISTORY_FILE_NAME = "src/test/resources/job_1329348432655_0001-1329348443227-user-Sleep+job-1329348468601-10-1-SUCCEEDED-default.jhist";
    File jobHistoryfile = new File(JOB_HISTORY_FILE_NAME);
    Path srcPath = new Path(jobHistoryfile.toURI());
    hdfs.copyFromLocalFile(srcPath, inputPath);
    Path expPath = new Path(inputPath.toUri() + "/" + srcPath.getName());
    assertTrue(hdfs.exists(expPath));
    origList[2] = hdfs.getFileStatus(expPath);

    final String JOB_CONF_FILE_NAME = "src/test/resources/job_1329348432655_0001_conf.xml";
    File jobConfFile = new File(JOB_CONF_FILE_NAME);
    srcPath = new Path(jobConfFile.toURI());
    hdfs.copyFromLocalFile(srcPath, inputPath);
    expPath = new Path(inputPath.toUri() + "/" + srcPath.getName());
    assertTrue(hdfs.exists(expPath));
    origList[3] = hdfs.getFileStatus(expPath);

    Path inputPath2 = new Path(inputPath.toUri() + "/"
            + "job_1311222222255_0221-1311111143227-user10101-WordCount-1-SUCCEEDED-default.jhist");
    hdfs.copyFromLocalFile(srcPath, inputPath2);
    assertTrue(hdfs.exists(inputPath2));
    origList[4] = hdfs.getFileStatus(inputPath2);

    Path inputPath3 = new Path(inputPath.toUri() + "/"
            + "job_1399999999155_0991-1311111143227-user3321-TeraGen-1-SUCCEEDED-default.jhist");
    hdfs.copyFromLocalFile(srcPath, inputPath3);
    assertTrue(hdfs.exists(inputPath3));
    origList[5] = hdfs.getFileStatus(inputPath3);

    Path inputPath4 = new Path(inputPath.toUri() + "/"
            + "job_1399977777177_0771-1311111143227-user3321-TeraSort-1-SUCCEEDED-default.jhist");
    hdfs.copyFromLocalFile(srcPath, inputPath4);
    assertTrue(hdfs.exists(inputPath4));
    origList[6] = hdfs.getFileStatus(inputPath4);

    Path emptyFile2 = new Path(inputPath.toUri() + "/" + "job_1329343333333_5551-1329111113227-user2-SomethingElse.jhist");
    os = hdfs.createNewFile(emptyFile2);
    assertTrue(os);
    assertTrue(hdfs.exists(emptyFile2));
    origList[7] = hdfs.getFileStatus(emptyFile2);

    Path emptyConfFile2 = new Path(inputPath.toUri() + "/" + "job_1329343333333_5551_conf.xml");
    os = hdfs.createNewFile(emptyConfFile2);
    assertTrue(os);
    assertTrue(hdfs.exists(emptyConfFile2));
    origList[8] = hdfs.getFileStatus(emptyConfFile2);

    // This is an empty file which tests the toBeRemovedFileList
    // at the end of the pruneFileListBySize function.
    Path emptyConfFile3 = new Path(inputPath.toUri() + "/" + "job_1399999999155_0991_conf.xml");
    os = hdfs.createNewFile(emptyConfFile3);
    assertTrue(os);
    assertTrue(hdfs.exists(emptyConfFile3));
    origList[9] = hdfs.getFileStatus(emptyConfFile3);

    Path inputConfPath2 = new Path(inputPath.toUri() + "/" + "job_1311222222255_0221_conf.xml");
    srcPath = new Path(jobConfFile.toURI());
    hdfs.copyFromLocalFile(srcPath, inputConfPath2);
    assertTrue(hdfs.exists(inputConfPath2));
    origList[10] = hdfs.getFileStatus(inputConfPath2);

    // This is an empty file which tests the toBeRemovedFileList
    // at the end of the pruneFileListBySize function.
    Path emptyConfFile4 = new Path(inputPath.toUri() + "/" + "job_1399977777177_0771_conf.xml");
    os = hdfs.createNewFile(emptyConfFile4);
    assertTrue(os);
    assertTrue(hdfs.exists(emptyConfFile4));
    origList[11] = hdfs.getFileStatus(emptyConfFile4);

    FileStatus[] prunedList = FileLister.pruneFileListBySize(maxFileSize, origList, hdfs, inputPath);
    assertNotNull(prunedList);
    assertTrue(prunedList.length == 4);
}
From source file:com.twitter.hraven.mapreduce.JobFileTableMapper.java
License:Apache License
/**
 * Calculates the cost of this job based on mbMillis, machineType,
 * and cost details from the properties file.
 *
 * @param mbMillis
 * @param currentConf
 * @return cost of the job
 */
private Double getJobCost(Long mbMillis, Configuration currentConf) {
    Double computeTco = 0.0;
    Long machineMemory = 0L;
    Properties prop = null;
    String machineType = currentConf.get(Constants.HRAVEN_MACHINE_TYPE, "default");
    LOG.debug(" machine type " + machineType);
    try {
        Path[] cacheFiles = DistributedCache.getLocalCacheFiles(currentConf);
        if (null != cacheFiles && cacheFiles.length > 0) {
            for (Path cachePath : cacheFiles) {
                LOG.debug(" distributed cache path " + cachePath);
                if (cachePath.getName().equals(Constants.COST_PROPERTIES_FILENAME)) {
                    prop = loadCostProperties(cachePath, machineType);
                    break;
                }
            }
        } else {
            LOG.error("Unable to find anything (" + Constants.COST_PROPERTIES_FILENAME
                    + ") in distributed cache, continuing with defaults");
        }
    } catch (IOException ioe) {
        LOG.error("IOException reading from distributed cache for " + Constants.COST_PROPERTIES_HDFS_DIR
                + ", continuing with defaults " + ioe.toString());
    }

    if (prop != null) {
        String computeTcoStr = prop.getProperty(machineType + ".computecost");
        try {
            computeTco = Double.parseDouble(computeTcoStr);
        } catch (NumberFormatException nfe) {
            LOG.error("error in conversion to double for compute tco " + computeTcoStr
                    + ", using default value of 0");
        }
        String machineMemStr = prop.getProperty(machineType + ".machinememory");
        try {
            machineMemory = Long.parseLong(machineMemStr);
        } catch (NumberFormatException nfe) {
            LOG.error("error in conversion to long for machine memory " + machineMemStr
                    + ", using default value of 0");
        }
    } else {
        LOG.error("Could not load properties file, using defaults");
    }

    Double jobCost = JobHistoryFileParserBase.calculateJobCost(mbMillis, computeTco, machineMemory);
    LOG.info("from cost properties file, jobCost is " + jobCost + " based on compute tco: " + computeTco
            + " machine memory: " + machineMemory + " for machine type " + machineType);
    return jobCost;
}
From source file:com.twitter.pig.backend.hadoop.executionengine.tez.TezJobControlCompiler.java
License:Apache License
public static void setupDistributedCache(PigContext pigContext, Configuration conf, String[] paths,
        boolean shipToCluster) throws IOException {
    // Turn on the symlink feature
    DistributedCache.createSymlink(conf);

    for (String path : paths) {
        path = path.trim();
        if (path.length() != 0) {
            Path src = new Path(path);

            // Ensure that 'src' is a valid URI
            URI srcURI = toURI(src);

            // Ship it to the cluster if necessary and add to the
            // DistributedCache
            if (shipToCluster) {
                Path dst = new Path(FileLocalizer.getTemporaryPath(pigContext).toString());
                FileSystem fs = dst.getFileSystem(conf);
                fs.copyFromLocalFile(src, dst);

                // Construct the dst#srcName URI for the DistributedCache
                URI dstURI = null;
                try {
                    dstURI = new URI(dst.toString() + "#" + src.getName());
                } catch (URISyntaxException ue) {
                    byte errSrc = pigContext.getErrorSource();
                    int errCode = 0;
                    switch (errSrc) {
                    case PigException.REMOTE_ENVIRONMENT:
                        errCode = 6004;
                        break;
                    case PigException.USER_ENVIRONMENT:
                        errCode = 4004;
                        break;
                    default:
                        errCode = 2037;
                        break;
                    }
                    String msg = "Invalid ship specification. " + "File doesn't exist: " + dst;
                    throw new ExecException(msg, errCode, errSrc);
                }
                DistributedCache.addCacheFile(dstURI, conf);
            } else {
                DistributedCache.addCacheFile(srcURI, conf);
            }
        }
    }
}