List of usage examples for org.apache.hadoop.fs Path getParent
public Path getParent()
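Before the examples, a minimal standalone sketch (not taken from any of the source files below; the path and written content are hypothetical placeholders) of the typical use of getParent(): resolving the enclosing directory of a Path, which is null for a root path, so the directory can be created or checked before the file itself is written.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class GetParentExample {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        // Hypothetical target file; in practice this would come from job or user input.
        Path file = new Path("/tmp/example/output/part-00000");
        FileSystem fs = file.getFileSystem(conf);

        // getParent() returns the enclosing directory, or null if the path is a root.
        Path parent = file.getParent();
        if (parent != null && !fs.exists(parent)) {
            fs.mkdirs(parent);
        }

        FSDataOutputStream out = fs.create(file);
        out.writeUTF("hello");
        out.close();
    }
}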
From source file:com.inmobi.messaging.consumer.util.HadoopUtil.java
License:Apache License
public static void setUpHadoopFiles(Path streamDirPrefix, Configuration conf, String[] files,
        String[] suffixDirs, Path[] finalFiles, boolean alternateEmptyFiles, Date minuteDirTimeStamp,
        int index, int startIndex) throws Exception {
    FileSystem fs = streamDirPrefix.getFileSystem(conf);
    Path rootDir = streamDirPrefix.getParent();
    Path tmpDataDir = new Path(rootDir, "data");
    boolean emptyFile = false;
    // setup data dirs
    if (files != null) {
        int i = startIndex;
        int j = index;
        for (String file : files) {
            if (alternateEmptyFiles && emptyFile) {
                MessageUtil.createEmptySequenceFile(file, fs, tmpDataDir, conf);
                emptyFile = false;
            } else {
                MessageUtil.createMessageSequenceFile(file, fs, tmpDataDir, i, conf);
                emptyFile = true;
                i += 100;
            }
            Path srcPath = new Path(tmpDataDir, file);
            Date commitTime = getCommitDateForFile(file, minuteDirTimeStamp);
            TestUtil.publishMissingPaths(fs, streamDirPrefix, lastCommitTime, commitTime);
            lastCommitTime = commitTime;
            Path targetDateDir = getTargetDateDir(streamDirPrefix, commitTime);
            List<Path> targetDirs = new ArrayList<Path>();
            if (suffixDirs != null) {
                for (String suffixDir : suffixDirs) {
                    targetDirs.add(new Path(targetDateDir, suffixDir));
                }
            } else {
                targetDirs.add(targetDateDir);
            }
            for (Path targetDir : targetDirs) {
                fs.mkdirs(targetDir);
                Path targetPath = new Path(targetDir, file);
                fs.copyFromLocalFile(srcPath, targetPath);
                LOG.info("Copied " + srcPath + " to " + targetPath);
                if (finalFiles != null) {
                    finalFiles[j] = targetPath;
                    j++;
                }
                Thread.sleep(1000);
            }
            fs.delete(srcPath, true);
        }
        TestUtil.publishLastPath(fs, streamDirPrefix, lastCommitTime);
    }
}
From source file:com.inmobi.messaging.consumer.util.HadoopUtil.java
License:Apache License
public static void setupHadoopCluster(Configuration conf, String[] files, String[] suffixDirs,
        Path[] finalFiles, Path finalDir, boolean withEmptyFiles, boolean createFilesInNextHour)
        throws Exception {
    FileSystem fs = finalDir.getFileSystem(conf);
    Path rootDir = finalDir.getParent();
    fs.delete(rootDir, true);
    Path tmpDataDir = new Path(rootDir, "data");
    fs.mkdirs(tmpDataDir);
    if (!createFilesInNextHour) {
        setUpHadoopFiles(finalDir, conf, files, suffixDirs, finalFiles, withEmptyFiles, null, 0, 0);
    } else {
        // start from 1 hour back as we need files in two diff hours.
        Calendar cal = Calendar.getInstance();
        cal.setTime(startCommitTime);
        cal.add(Calendar.HOUR_OF_DAY, -1);
        setUpHadoopFiles(finalDir, conf, files, suffixDirs, finalFiles, withEmptyFiles, cal.getTime(), 0, 0);
        // go to next hour
        cal.add(Calendar.HOUR_OF_DAY, 1);
        int index = files.length;
        // find number of non-empty (i.e. data) files in 1 hour
        int numberOfNonEmptyFiles = withEmptyFiles ? (int) Math.ceil(index / 2.0) : index;
        int startIndex = numberOfNonEmptyFiles * 100;
        setUpHadoopFiles(finalDir, conf, files, suffixDirs, finalFiles, withEmptyFiles, cal.getTime(), index,
                startIndex);
    }
}
From source file:com.intel.hibench.streambench.FileDataGenNew.java
License:Apache License
private InputStream OpenMultiplePartsWithOffset(FileSystem fs, Path pt, long offset) throws IOException {
    System.out.println("Opening files, path:" + pt + " offset:" + offset);
    RemoteIterator<LocatedFileStatus> rit = fs.listFiles(pt, false);
    Vector<FSDataInputStream> fileHandleList = new Vector<FSDataInputStream>();
    while (rit.hasNext()) {
        Path path = rit.next().getPath();
        String filename = path.toString().substring(path.getParent().toString().length(),
                path.toString().length());
        if (filename.startsWith("/part-")) {
            long filesize = fs.getFileStatus(path).getLen();
            if (offset < filesize) {
                FSDataInputStream handle = fs.open(path);
                if (offset > 0) {
                    handle.seek(offset);
                }
                fileHandleList.add(handle);
            }
            offset -= filesize;
        }
    }
    if (fileHandleList.size() == 1)
        return fileHandleList.get(0);
    else if (fileHandleList.size() > 1) {
        Enumeration<FSDataInputStream> enu = fileHandleList.elements();
        return new SequenceInputStream(enu);
    } else {
        System.err.println("Error, no source file loaded. run genSeedDataset.sh first!");
        return null;
    }
}
From source file:com.intropro.prairie.unit.hdfs.HdfsUnit.java
License:Apache License
public <T> void saveAs(InputStream inputStream, String dstPath, Format<T> inputFormat, Format<T> outputFormat)
        throws IOException {
    Path outPath = new Path(dstPath);
    getFileSystem().mkdirs(outPath.getParent());
    FSDataOutputStream fsDataOutputStream = getFileSystem().create(outPath);
    try {
        InputFormatReader<T> inputFormatReader = inputFormat.createReader(inputStream);
        OutputFormatWriter<T> outputFormatWriter = outputFormat.createWriter(fsDataOutputStream);
        T line;
        while ((line = inputFormatReader.next()) != null) {
            outputFormatWriter.write(line);
        }
        inputFormatReader.close();
        outputFormatWriter.close();
    } catch (FormatException e) {
        throw new IOException(e);
    }
}
From source file:com.jaeksoft.searchlib.crawler.cache.HadoopCrawlCache.java
License:Open Source License
private Path checkPath(Path path) throws IOException {
    if (!fileSystem.exists(path)) {
        Path parent = path.getParent();
        if (!fileSystem.exists(parent))
            fileSystem.mkdirs(parent);
    }
    return path;
}
From source file:com.jkoolcloud.tnt4j.streams.inputs.HdfsFileLineStream.java
License:Apache License
/**
 * Searches for files matching a name pattern. The name pattern may also contain the path of the directory
 * where the file search should be performed, e.g., C:/Tomcat/logs/localhost_access_log.*.txt. If no path is
 * defined (just a file name pattern), files are searched in {@code System.getProperty("user.dir")}. The
 * returned array is ordered by file modification timestamp in descending order.
 *
 * @param path
 *            path of file
 * @param fs
 *            file system
 *
 * @return array of found files paths.
 * @throws IOException
 *             if files can't be listed by file system.
 *
 * @see FileSystem#listStatus(Path, PathFilter)
 * @see FilenameUtils#wildcardMatch(String, String, IOCase)
 */
public static Path[] searchFiles(Path path, FileSystem fs) throws IOException {
    FileStatus[] dir = fs.listStatus(path.getParent(), new PathFilter() {
        @Override
        public boolean accept(Path path) {
            String name = path.getName();
            return FilenameUtils.wildcardMatch(name, "*", IOCase.INSENSITIVE); // NON-NLS
        }
    });
    Path[] activityFiles = new Path[dir == null ? 0 : dir.length];
    if (dir != null) {
        Arrays.sort(dir, new Comparator<FileStatus>() {
            @Override
            public int compare(FileStatus o1, FileStatus o2) {
                return Long.valueOf(o1.getModificationTime()).compareTo(o2.getModificationTime()) * (-1);
            }
        });
        for (int i = 0; i < dir.length; i++) {
            activityFiles[i] = dir[i].getPath();
        }
    }
    return activityFiles;
}
From source file:com.kadwa.hadoop.DistExec.java
License:Open Source License
/**
 * Initialize ExecFilesMapper specific job-configuration.
 *
 * @param conf : The dfs/mapred configuration.
 * @param jobConf : The handle to the jobConf object to be initialized.
 * @param args Arguments
 * @return true if it is necessary to launch a job.
 */
private static boolean setup(Configuration conf, JobConf jobConf, final Arguments args) throws IOException {
    jobConf.set(DST_DIR_LABEL, args.dst.toUri().toString());
    jobConf.set(EXEC_CMD_LABEL, args.execCmd);

    // set boolean values
    jobConf.setBoolean(Options.REDIRECT_ERROR_TO_OUT.propertyname,
            args.flags.contains(Options.REDIRECT_ERROR_TO_OUT));

    final String randomId = getRandomId();
    JobClient jClient = new JobClient(jobConf);
    Path stagingArea;
    try {
        stagingArea = JobSubmissionFiles.getStagingDir(jClient, conf);
    } catch (InterruptedException e) {
        throw new IOException(e);
    }
    Path jobDirectory = new Path(stagingArea + NAME + "_" + randomId);
    FsPermission mapredSysPerms = new FsPermission(JobSubmissionFiles.JOB_DIR_PERMISSION);
    FileSystem.mkdirs(FileSystem.get(jobDirectory.toUri(), conf), jobDirectory, mapredSysPerms);
    jobConf.set(JOB_DIR_LABEL, jobDirectory.toString());

    FileSystem dstfs = args.dst.getFileSystem(conf);

    // get tokens for all the required FileSystems..
    TokenCache.obtainTokensForNamenodes(jobConf.getCredentials(), new Path[] { args.dst }, conf);

    boolean dstExists = dstfs.exists(args.dst);
    boolean dstIsDir = false;
    if (dstExists) {
        dstIsDir = dstfs.getFileStatus(args.dst).isDir();
    }

    // default logPath
    Path logPath = args.log;
    if (logPath == null) {
        String filename = "_" + NAME + "_logs_" + randomId;
        if (!dstExists || !dstIsDir) {
            Path parent = args.dst.getParent();
            if (!dstfs.exists(parent)) {
                dstfs.mkdirs(parent);
            }
            logPath = new Path(parent, filename);
        } else {
            logPath = new Path(args.dst, filename);
        }
    }
    FileOutputFormat.setOutputPath(jobConf, logPath);

    // create src list, dst list
    FileSystem jobfs = jobDirectory.getFileSystem(jobConf);

    Path srcfilelist = new Path(jobDirectory, "_" + NAME + "_src_files");
    jobConf.set(SRC_LIST_LABEL, srcfilelist.toString());
    SequenceFile.Writer src_writer = SequenceFile.createWriter(jobfs, jobConf, srcfilelist, LongWritable.class,
            FilePair.class, SequenceFile.CompressionType.NONE);

    Path dstfilelist = new Path(jobDirectory, "_" + NAME + "_dst_files");
    SequenceFile.Writer dst_writer = SequenceFile.createWriter(jobfs, jobConf, dstfilelist, Text.class,
            Text.class, SequenceFile.CompressionType.NONE);

    Path dstdirlist = new Path(jobDirectory, "_" + NAME + "_dst_dirs");
    jobConf.set(DST_DIR_LIST_LABEL, dstdirlist.toString());
    SequenceFile.Writer dir_writer = SequenceFile.createWriter(jobfs, jobConf, dstdirlist, Text.class,
            FilePair.class, SequenceFile.CompressionType.NONE);

    // handle the case where the destination directory doesn't exist
    // and we've only a single src directory.
    final boolean special = (args.srcs.size() == 1 && !dstExists);

    int srcCount = 0, cnsyncf = 0, dirsyn = 0;
    long fileCount = 0L, byteCount = 0L, cbsyncs = 0L;
    try {
        for (Iterator<Path> srcItr = args.srcs.iterator(); srcItr.hasNext();) {
            final Path src = srcItr.next();
            FileSystem srcfs = src.getFileSystem(conf);
            FileStatus srcfilestat = srcfs.getFileStatus(src);
            Path root = special && srcfilestat.isDir() ? src : src.getParent();
            if (srcfilestat.isDir()) {
                ++srcCount;
            }

            Stack<FileStatus> pathstack = new Stack<FileStatus>();
            for (pathstack.push(srcfilestat); !pathstack.empty();) {
                FileStatus cur = pathstack.pop();
                FileStatus[] children = srcfs.listStatus(cur.getPath());
                for (int i = 0; i < children.length; i++) {
                    boolean skipfile = false;
                    final FileStatus child = children[i];
                    final String dst = makeRelative(root, child.getPath());

                    ++srcCount;

                    if (child.isDir()) {
                        pathstack.push(child);
                    } else {
                        if (!skipfile) {
                            ++fileCount;
                            byteCount += child.getLen();

                            if (LOG.isTraceEnabled()) {
                                LOG.trace("adding file " + child.getPath());
                            }

                            ++cnsyncf;
                            cbsyncs += child.getLen();
                            if (cnsyncf > SYNC_FILE_MAX || cbsyncs > BYTES_PER_MAP) {
                                src_writer.sync();
                                dst_writer.sync();
                                cnsyncf = 0;
                                cbsyncs = 0L;
                            }
                        }
                    }

                    if (!skipfile) {
                        src_writer.append(new LongWritable(child.isDir() ? 0 : child.getLen()),
                                new FilePair(child, dst));
                    }

                    dst_writer.append(new Text(dst), new Text(child.getPath().toString()));
                }

                if (cur.isDir()) {
                    String dst = makeRelative(root, cur.getPath());
                    dir_writer.append(new Text(dst), new FilePair(cur, dst));
                    if (++dirsyn > SYNC_FILE_MAX) {
                        dirsyn = 0;
                        dir_writer.sync();
                    }
                }
            }
        }
    } finally {
        checkAndClose(src_writer);
        checkAndClose(dst_writer);
        checkAndClose(dir_writer);
    }

    FileStatus dststatus = null;
    try {
        dststatus = dstfs.getFileStatus(args.dst);
    } catch (FileNotFoundException fnfe) {
        LOG.info(args.dst + " does not exist.");
    }

    // create dest path dir if copying > 1 file
    if (dststatus == null) {
        if (srcCount > 1 && !dstfs.mkdirs(args.dst)) {
            throw new IOException("Failed to create" + args.dst);
        }
    }

    final Path sorted = new Path(jobDirectory, "_" + NAME + "_sorted");
    checkDuplication(jobfs, dstfilelist, sorted, conf);

    Path tmpDir = new Path(
            (dstExists && !dstIsDir) || (!dstExists && srcCount == 1) ? args.dst.getParent() : args.dst,
            "_" + NAME + "_tmp_" + randomId);
    jobConf.set(TMP_DIR_LABEL, tmpDir.toUri().toString());

    LOG.info("sourcePathsCount=" + srcCount);
    LOG.info("filesToExecCount=" + fileCount);
    LOG.info("bytesToExecCount=" + StringUtils.humanReadableInt(byteCount));
    jobConf.setInt(SRC_COUNT_LABEL, srcCount);
    jobConf.setLong(TOTAL_SIZE_LABEL, byteCount);
    setMapCount(fileCount, jobConf);
    return fileCount > 0;
}
From source file:com.kylinolap.dict.DictionaryManager.java
License:Apache License
private String unpackDataSet(String tempHDFSDir, String dataSetName) throws IOException {
    InputStream in = this.getClass().getResourceAsStream("/com/kylinolap/dict/" + dataSetName + ".txt");
    if (in == null) // data set resource not found
        return null;

    ByteArrayOutputStream buf = new ByteArrayOutputStream();
    IOUtils.copy(in, buf);
    in.close();
    byte[] bytes = buf.toByteArray();

    Path tmpDataSetPath = new Path(
            tempHDFSDir + "/dict/temp_dataset/" + dataSetName + "_" + bytes.length + ".txt");

    FileSystem fs = HadoopUtil.getFileSystem(tempHDFSDir);
    boolean writtenNewFile = false;
    if (fs.exists(tmpDataSetPath) == false || fs.getFileStatus(tmpDataSetPath).getLen() != bytes.length) {
        fs.mkdirs(tmpDataSetPath.getParent());
        FSDataOutputStream out = fs.create(tmpDataSetPath);
        IOUtils.copy(new ByteArrayInputStream(bytes), out);
        out.close();
        writtenNewFile = true;
    }

    String qualifiedPath = tmpDataSetPath.makeQualified(fs.getUri(), new Path("/")).toString();
    if (writtenNewFile)
        logger.info("Dictionary temp data set file written to " + qualifiedPath);
    return qualifiedPath;
}
From source file:com.liferay.hadoop.action.HadoopJob.java
License:Open Source License
public String doExecute(HttpServletRequest request, HttpServletResponse response) throws Exception {
    response.setContentType(ContentTypes.TEXT_PLAIN_UTF8);

    PrintWriter writer = response.getWriter();

    FileSystem fileSystem = HadoopManager.getFileSystem();
    JobClient jobClient = HadoopManager.getJobClient();

    writer.println("-- Job Status --");

    Path inputPath = new Path("/index/*/*");
    Path outputPath = new Path("/wordcount/results");

    try {
        if (_runningJob == null) {
            writer.println("Creating job");

            if (fileSystem.exists(_jobPath)) {
                fileSystem.delete(_jobPath, false);
            }

            if (!fileSystem.exists(_jobPath)) {
                writer.println("Deploying the job code to cluster");

                FSDataOutputStream outputStream = null;

                try {
                    outputStream = fileSystem.create(_jobPath);

                    ServletContext servletContext = HadoopManager.getServletContext();

                    InputStream inputStream = servletContext.getResourceAsStream("/WEB-INF/lib/hadoop-job.jar");

                    StreamUtil.transfer(inputStream, outputStream, false);
                } finally {
                    StreamUtil.cleanUp(outputStream);
                }

                writer.println("Job code deployed to cluster");
            }

            if (fileSystem.exists(outputPath)) {
                writer.println("A previous job output was found, backing it up");

                fileSystem.rename(outputPath,
                        outputPath.getParent().suffix("/.results-" + System.currentTimeMillis()));
            }

            _jobConf = HadoopManager.createNewJobConf();
            _jobConf.setJobName("Word Count");

            writer.println("Job '" + _jobConf.getJobName() + "' is being configured");

            _jobConf.setJarByClass(Map.class);
            _jobConf.setOutputKeyClass(Text.class);
            _jobConf.setOutputValueClass(IntWritable.class);
            _jobConf.setMapperClass(Map.class);
            _jobConf.setCombinerClass(Reduce.class);
            _jobConf.setReducerClass(Reduce.class);
            _jobConf.setInputFormat(TextInputFormat.class);
            _jobConf.setOutputFormat(TextOutputFormat.class);

            writer.println("Job code deployed to distributed cache's classpath");

            DistributedCache.addArchiveToClassPath(_jobPath, _jobConf, fileSystem);

            FileInputFormat.setInputPaths(_jobConf, inputPath);
            FileOutputFormat.setOutputPath(_jobConf, outputPath);

            writer.println("Submitting job the first time");

            _runningJob = jobClient.submitJob(_jobConf);

            writer.println("Job submitted");
        }

        int jobState = _runningJob.getJobState();

        writer.println(
                "Job status: " + jobState + " (RUNNING = 1, SUCCEEDED = 2, FAILED = 3, PREP = 4, KILLED = 5)");

        if ((jobState != JobStatus.RUNNING) && (jobState != JobStatus.PREP)) {
            writer.println("Re-issuing the job");

            if (fileSystem.exists(outputPath)) {
                writer.println("A previous job output was found, backing it up");

                fileSystem.rename(outputPath,
                        outputPath.getParent().suffix("/.results-" + System.currentTimeMillis()));
            }

            writer.println("Submitting job the first time");

            _runningJob = jobClient.submitJob(_jobConf);

            writer.println("Job submitted");
        }
    } catch (Exception ioe) {
        writer.println("Job error: ");

        ioe.printStackTrace(writer);
    }

    writer.flush();
    writer.close();

    return null;
}
From source file:com.liferay.hadoop.store.HDFSStore.java
License:Open Source License
@Override
public void deleteDirectory(long companyId, long repositoryId, String dirName)
        throws PortalException, SystemException {
    Path fullPath = HadoopManager.getFullDirPath(companyId, repositoryId, dirName);

    try {
        FileSystem fileSystem = HadoopManager.getFileSystem();

        fileSystem.delete(fullPath, true);

        Path parentPath = fullPath.getParent();

        deleteEmptyAncestors(parentPath);
    } catch (IOException ioe) {
        throw new SystemException(ioe);
    }
}