List of usage examples for org.apache.hadoop.fs.Path#getName()
public String getName()
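Before the project examples below, here is a minimal sketch of what getName() returns: the final component of the path, with any parent directories stripped. The paths used here are illustrative only.

import org.apache.hadoop.fs.Path;

public class PathGetNameSketch {
    public static void main(String[] args) {
        // getName() returns only the last path component
        Path file = new Path("/user/hadoop/data/part-00000.gz"); // illustrative path
        System.out.println(file.getName()); // prints "part-00000.gz"

        // For a single-component (relative) path, getName() returns that component unchanged
        Path rel = new Path("README.md"); // illustrative path
        System.out.println(rel.getName()); // prints "README.md"
    }
}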
From source file:com.talis.hadoop.rdf.ZipUtils.java
License:Apache License
public static void packZipFile(Path perm, Path temp, Configuration conf, FileSystem fs) throws IOException {
    FSDataOutputStream out = null;
    ZipOutputStream zos = null;
    int zipCount = 0;
    LOG.info("Packing zip file for " + perm);
    try {
        out = fs.create(perm, false);
        zos = new ZipOutputStream(out);
        String name = perm.getName().replaceAll(".zip$", "");
        LOG.info("adding index directory" + temp);
        zipCount = zipDirectory(conf, zos, "", temp.toString(), temp);
    } catch (Throwable t) {
        LOG.error("packZipFile exception", t);
        if (t instanceof RuntimeException) {
            throw (RuntimeException) t;
        }
        if (t instanceof IOException) {
            throw (IOException) t;
        }
        throw new IOException(t);
    } finally {
        if (zos != null) {
            if (zipCount == 0) {
                // If no entries were written, only close out, as
                // the zip will throw an error
                LOG.error("No entries written to zip file " + perm);
                fs.delete(perm, false);
                // out.close();
            } else {
                LOG.info(String.format("Wrote %d items to %s for %s", zipCount, perm, temp));
                zos.close();
            }
        }
    }
}
From source file:com.talis.mapreduce.dicenc.ThirdDriver.java
License:Apache License
@Override
public boolean accept(Path path) {
    return !path.getName().startsWith("dict");
}
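For context, a hedged sketch of how a PathFilter like the one above is typically applied, for example with FileSystem#listStatus; the input path is illustrative only.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.PathFilter;

public class DictFilterSketch {
    public static void main(String[] args) throws Exception {
        FileSystem fs = FileSystem.get(new Configuration());
        // Same idea as the accept() method above: skip files whose name starts with "dict"
        PathFilter filter = path -> !path.getName().startsWith("dict");
        FileStatus[] statuses = fs.listStatus(new Path("/tmp/input"), filter); // illustrative path
        for (FileStatus status : statuses) {
            System.out.println(status.getPath().getName());
        }
    }
}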
From source file:com.tamingtext.util.SplitInput.java
License:Apache License
/**
 * Perform a split on the specified input file. Results will be written to files of the same name in the specified
 * training and test output directories. The {@link #validate()} method is called prior to executing the split.
 */
public void splitFile(Path inputFile) throws IOException {
    if (fs.getFileStatus(inputFile) == null) {
        throw new IOException(inputFile + " does not exist");
    } else if (fs.getFileStatus(inputFile).isDir()) {
        throw new IOException(inputFile + " is a directory");
    }

    validate();

    Path testOutputFile = new Path(testOutputDirectory, inputFile.getName());
    Path trainingOutputFile = new Path(trainingOutputDirectory, inputFile.getName());

    int lineCount = countLines(fs, inputFile, charset);

    log.info("{} has {} lines", inputFile.getName(), lineCount);

    int testSplitStart = 0;
    int testSplitSize = this.testSplitSize; // don't modify state
    BitSet randomSel = null;

    if (testRandomSelectionPct > 0 || testRandomSelectionSize > 0) {
        testSplitSize = this.testRandomSelectionSize;

        if (testRandomSelectionPct > 0) {
            testSplitSize = Math.round(lineCount * (testRandomSelectionPct / 100.0f));
        }
        log.info("{} test split size is {} based on random selection percentage {}",
                new Object[] { inputFile.getName(), testSplitSize, testRandomSelectionPct });
        long[] ridx = new long[testSplitSize];
        RandomSampler.sample(testSplitSize, lineCount - 1, testSplitSize, 0, ridx, 0, RandomUtils.getRandom());
        randomSel = new BitSet(lineCount);
        for (long idx : ridx) {
            randomSel.set((int) idx + 1);
        }
    } else {
        if (testSplitPct > 0) { // calculate split size based on percentage
            testSplitSize = Math.round(lineCount * (testSplitPct / 100.0f));
            log.info("{} test split size is {} based on percentage {}",
                    new Object[] { inputFile.getName(), testSplitSize, testSplitPct });
        } else {
            log.info("{} test split size is {}", inputFile.getName(), testSplitSize);
        }

        if (splitLocation > 0) { // calculate start of split based on percentage
            testSplitStart = Math.round(lineCount * (splitLocation / 100.0f));
            if (lineCount - testSplitStart < testSplitSize) {
                // adjust split start downwards based on split size.
                testSplitStart = lineCount - testSplitSize;
            }
            log.info("{} test split start is {} based on split location {}",
                    new Object[] { inputFile.getName(), testSplitStart, splitLocation });
        }

        if (testSplitStart < 0) {
            throw new IllegalArgumentException("test split size for " + inputFile + " is too large, it would produce an "
                    + "empty training set from the initial set of " + lineCount + " examples");
        } else if ((lineCount - testSplitSize) < testSplitSize) {
            log.warn("Test set size for {} may be too large, {} is larger than the number of "
                    + "lines remaining in the training set: {}",
                    new Object[] { inputFile, testSplitSize, lineCount - testSplitSize });
        }
    }

    BufferedReader reader = new BufferedReader(new InputStreamReader(fs.open(inputFile), charset));
    Writer trainingWriter = new OutputStreamWriter(fs.create(trainingOutputFile), charset);
    Writer testWriter = new OutputStreamWriter(fs.create(testOutputFile), charset);

    int pos = 0;
    int trainCount = 0;
    int testCount = 0;

    String line;
    while ((line = reader.readLine()) != null) {
        pos++;

        Writer writer;
        if (testRandomSelectionPct > 0) { // Randomly choose
            writer = randomSel.get(pos) ? testWriter : trainingWriter;
        } else { // Choose based on location
            writer = pos > testSplitStart ? testWriter : trainingWriter;
        }

        if (writer == testWriter) {
            if (testCount >= testSplitSize) {
                writer = trainingWriter;
            } else {
                testCount++;
            }
        }
        if (writer == trainingWriter) {
            trainCount++;
        }
        writer.write(line);
        writer.write('\n');
    }
    IOUtils.close(Collections.singleton(trainingWriter));
    IOUtils.close(Collections.singleton(testWriter));
    log.info("file: {}, input: {} train: {}, test: {} starting at {}",
            new Object[] { inputFile.getName(), lineCount, trainCount, testCount, testSplitStart });
    // testing;
    if (callback != null) {
        callback.splitComplete(inputFile, lineCount, trainCount, testCount, testSplitStart);
    }
}
From source file:com.taobao.datax.plugins.common.DFSUtils.java
License:Open Source License
/**
 * Delete file specified by path.
 *
 * @param path
 *            {@link Path} in hadoop
 * @param flag
 *            need to do delete recursively
 * @throws IOException
 */
public static void deleteFile(Path path, boolean flag) throws IOException {
    log.debug("deleting:" + path.getName());
    fs.delete(path, flag);
}
From source file:com.taobao.datax.plugins.common.DFSUtils.java
License:Open Source License
/**
 * Delete file specified by path.
 *
 * @param dfs
 *            handle of {@link FileSystem}
 * @param path
 *            {@link Path} in hadoop
 * @param flag
 *            need to do delete recursively
 * @throws IOException
 */
public static void deleteFile(FileSystem dfs, Path path, boolean flag) throws IOException {
    log.debug("deleting:" + path.getName());
    dfs.delete(path, flag);
}
From source file:com.TCG.Nutch_DNS.Generator.java
License:Apache License
/**
 * Generate fetchlists in one or more segments. Whether to filter URLs or not
 * is read from the crawl.generate.filter property in the configuration files.
 * If the property is not found, the URLs are filtered. Same for the
 * normalisation.
 *
 * @param dbDir
 *          Crawl database directory
 * @param segments
 *          Segments directory
 * @param numLists
 *          Number of reduce tasks
 * @param topN
 *          Number of top URLs to be selected
 * @param curTime
 *          Current time in milliseconds
 *
 * @return Path to generated segment or null if no entries were selected
 *
 * @throws IOException
 *           When an I/O error occurs
 */
public Path[] generate(Path dbDir, Path segments, int numLists, long topN, long curTime, boolean filter,
        boolean norm, boolean force, int maxNumSegments) throws IOException {

    Path tempDir = new Path(
            getConf().get("mapred.temp.dir", ".") + "/generate-temp-" + UUID.randomUUID().toString());

    Path lock = new Path(dbDir, CrawlDb.LOCK_NAME);
    FileSystem fs = FileSystem.get(getConf());
    LockUtil.createLockFile(fs, lock, force);

    SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
    long start = System.currentTimeMillis();
    LOG.info("Generator: starting at " + sdf.format(start));
    LOG.info("Generator: Selecting best-scoring urls due for fetch.");
    LOG.info("Generator: filtering: " + filter);
    LOG.info("Generator: normalizing: " + norm);
    if (topN != Long.MAX_VALUE) {
        LOG.info("Generator: topN: " + topN);
    }

    // map to inverted subset due for fetch, sort by score
    JobConf job = new NutchJob(getConf());
    job.setJobName("generate: select from " + dbDir);

    if (numLists == -1) { // for politeness make
        numLists = job.getNumMapTasks(); // a partition per fetch task
    }
    if ("local".equals(job.get("mapred.job.tracker")) && numLists != 1) {
        // override
        LOG.info("Generator: jobtracker is 'local', generating exactly one partition.");
        numLists = 1;
    }
    job.setLong(GENERATOR_CUR_TIME, curTime);
    // record real generation time
    long generateTime = System.currentTimeMillis();
    job.setLong(Nutch.GENERATE_TIME_KEY, generateTime);
    job.setLong(GENERATOR_TOP_N, topN);
    job.setBoolean(GENERATOR_FILTER, filter);
    job.setBoolean(GENERATOR_NORMALISE, norm);
    job.setInt(GENERATOR_MAX_NUM_SEGMENTS, maxNumSegments);

    FileInputFormat.addInputPath(job, new Path(dbDir, CrawlDb.CURRENT_NAME));
    job.setInputFormat(SequenceFileInputFormat.class);

    job.setMapperClass(Selector.class);
    job.setPartitionerClass(Selector.class);
    job.setReducerClass(Selector.class);

    FileOutputFormat.setOutputPath(job, tempDir);
    job.setOutputFormat(SequenceFileOutputFormat.class);
    job.setOutputKeyClass(FloatWritable.class);
    job.setOutputKeyComparatorClass(DecreasingFloatComparator.class);
    job.setOutputValueClass(SelectorEntry.class);
    job.setOutputFormat(GeneratorOutputFormat.class);

    try {
        JobClient.runJob(job);
    } catch (IOException e) {
        LockUtil.removeLockFile(fs, lock);
        fs.delete(tempDir, true);
        throw e;
    }

    // read the subdirectories generated in the temp
    // output and turn them into segments
    List<Path> generatedSegments = new ArrayList<Path>();

    FileStatus[] status = fs.listStatus(tempDir);
    try {
        for (FileStatus stat : status) {
            Path subfetchlist = stat.getPath();
            if (!subfetchlist.getName().startsWith("fetchlist-"))
                continue;
            // start a new partition job for this segment
            Path newSeg = partitionSegment(fs, segments, subfetchlist, numLists);
            generatedSegments.add(newSeg);
        }
    } catch (Exception e) {
        LOG.warn("Generator: exception while partitioning segments, exiting ...");
        fs.delete(tempDir, true);
        return null;
    }

    if (generatedSegments.size() == 0) {
        LOG.warn("Generator: 0 records selected for fetching, exiting ...");
        LockUtil.removeLockFile(fs, lock);
        fs.delete(tempDir, true);
        return null;
    }

    if (getConf().getBoolean(GENERATE_UPDATE_CRAWLDB, false)) {
        // update the db from tempDir
        Path tempDir2 = new Path(
                getConf().get("mapred.temp.dir", ".") + "/generate-temp-" + UUID.randomUUID().toString());

        job = new NutchJob(getConf());
        job.setJobName("generate: updatedb " + dbDir);
        job.setLong(Nutch.GENERATE_TIME_KEY, generateTime);
        for (Path segmpaths : generatedSegments) {
            Path subGenDir = new Path(segmpaths, CrawlDatum.GENERATE_DIR_NAME);
            FileInputFormat.addInputPath(job, subGenDir);
        }
        FileInputFormat.addInputPath(job, new Path(dbDir, CrawlDb.CURRENT_NAME));
        job.setInputFormat(SequenceFileInputFormat.class);
        job.setMapperClass(CrawlDbUpdater.class);
        job.setReducerClass(CrawlDbUpdater.class);
        job.setOutputFormat(MapFileOutputFormat.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(CrawlDatum.class);
        FileOutputFormat.setOutputPath(job, tempDir2);
        try {
            JobClient.runJob(job);
            CrawlDb.install(job, dbDir);
        } catch (IOException e) {
            LockUtil.removeLockFile(fs, lock);
            fs.delete(tempDir, true);
            fs.delete(tempDir2, true);
            throw e;
        }
        fs.delete(tempDir2, true);
    }

    LockUtil.removeLockFile(fs, lock);
    fs.delete(tempDir, true);

    long end = System.currentTimeMillis();
    LOG.info("Generator: finished at " + sdf.format(end) + ", elapsed: " + TimingUtil.elapsedTime(start, end));

    Path[] patharray = new Path[generatedSegments.size()];
    return generatedSegments.toArray(patharray);
}
From source file:com.thinkbiganalytics.spark.io.ZipStreamingOutput.java
License:Apache License
/**
 * Creates a new ZIP entry for the specified path.
 */
@Nonnull
private ZipEntry createEntry(@Nonnull final Path path) {
    return new ZipEntry(path.getName());
}
From source file:com.toy.Client.java
License:Apache License
private static void registerLocalResource(Map<String, LocalResource> localResources, ApplicationId appId,
        FileSystem fs, Path src) throws IOException {
    String pathSuffix = Constants.TOY_PREFIX + appId.toString() + "/" + src.getName();
    Path dst = new Path(fs.getHomeDirectory(), pathSuffix);
    LOG.info("Copy {} from local filesystem to {} and add to local environment", src.getName(), dst.toUri());
    fs.copyFromLocalFile(false, true, src, dst);
    FileStatus destStatus = fs.getFileStatus(dst);
    LocalResource amJarRsrc = Records.newRecord(LocalResource.class);
    amJarRsrc.setType(LocalResourceType.FILE);
    amJarRsrc.setVisibility(LocalResourceVisibility.APPLICATION);
    amJarRsrc.setResource(ConverterUtils.getYarnUrlFromPath(dst));
    amJarRsrc.setTimestamp(destStatus.getModificationTime());
    amJarRsrc.setSize(destStatus.getLen());
    localResources.put(src.getName(), amJarRsrc);
}
From source file:com.trendmicro.hdfs.webdav.HDFSResource.java
License:Apache License
@Override
public String getHref() {
    StringBuilder sb = new StringBuilder();
    Path p = this.path;
    while (p != null && !("".equals(p.getName()))) {
        sb.insert(0, p.getName());
        sb.insert(0, "/");
        p = p.getParent();
    }
    if (sb.length() == 0) {
        sb.insert(0, "/");
    }
    return sb.toString();
}
From source file:com.tripadvisor.hadoop.BackupHdfs.java
License:Apache License
/**
 * Method to go through the HDFS filesystem in a DFS to find all
 * files
 *
 * fs: FileSystem object from HDFS
 * minDate: Oldest date for files to be backed up
 * maxDate: Newest date for files to be backed up
 * p: Path in HDFS to look for files
 * pathList: Will be filled with all files in p
 * hmTimestamps: hashmap of timestamps for later sorting
 **/
public void checkDir(FileSystem fs, long minDate, long maxDate, Path p, ArrayList<Path> pathList,
        HashMap<Path, Long> hmTimestamps) {
    long tmpDate;
    FileStatus[] fStat;

    try {
        String sPath = p.toUri().getPath();

        // If this is a directory
        if (fs.getFileStatus(p).isDir()) {
            // ignore certain directories
            if ("dfstmp".equals(p.getName()) || "tmp".equals(p.getName()) || "jobtracker".equals(p.getName())
                    || sPath.startsWith("/mapred") || "ops".equals(p.getName())
                    || p.getName().startsWith("_distcp_logs")) {
                return;
            }

            // dump the mkdir and chmod commands for this
            // directory -- skip root directory only
            {
                FileStatus stat = fs.getFileStatus(p);

                if (!sPath.equals("/")) {
                    m_wrMkdirs.println("hadoop fs -mkdir " + sPath);
                }

                m_wrChmods.println("hadoop fs -chown " + stat.getOwner() + ":" + stat.getGroup() + " " + sPath);

                Short sh = new Short(stat.getPermission().toShort());
                m_wrChmods.println("hadoop fs -chmod " + Long.toOctalString(sh.longValue()) + " " + sPath);
            }

            fStat = fs.listStatus(p);

            // Do a recursive call to all elements
            for (int i = 0; i < fStat.length; i++) {
                checkDir(fs, minDate, maxDate, fStat[i].getPath(), pathList, hmTimestamps);
            }
        } else {
            // If not a directory then we've found a file

            // ignore crc files
            if (p.getName().endsWith(".crc")) {
                return;
            }

            // ignore other files
            if (sPath.startsWith("/user/oozie/etl/workflows/")) {
                return;
            }

            // try to get the table name from the path. There are
            // various types of tables, from those replicated from
            // another database to regular hive tables to
            // partitioned hive tables. We use table names to
            // both exclude some from the backup, and for the rest
            // to dump out the schema and partition name.
            if (m_ignoreTables != null && m_ignoreTables.doIgnoreFile(sPath)) {
                m_nIgnoredTables++;

                if (m_nIgnoredTables < 5) {
                    System.out.println("Skipping ignore-table file: " + sPath);
                } else if (m_nIgnoredTables == 5) {
                    System.out.println("(...not showing other skipped tables...)");
                }
                return;
            }

            FileStatus stat = fs.getFileStatus(p);

            tmpDate = stat.getModificationTime() / 1000;

            // store the chmods/chowns for all files
            m_wrChmods.println("hadoop fs -chown " + stat.getOwner() + ":" + stat.getGroup() + " " + sPath);
            m_wrChmods.println("hadoop fs -chmod " + stat.getPermission().toShort() + " " + sPath);

            // check dates. is it too young?
            if (tmpDate < minDate) {
                return;
            }

            // is the file too recent?
            if (tmpDate > maxDate) {
                // System.out.println("file too recent: " + sPath);
                return;
            }

            // file timestamp is ok
            pathList.add(p);
            hmTimestamps.put(p, new Long(tmpDate));

            // store info about total bytes needed to backup
            m_nTotalBytes += fs.getContentSummary(p).getLength();
        }
    } catch (IOException e) {
        System.err.println("ERROR: could not open " + p + ": " + e);
        // System.exit(1);
    }
}