List of usage examples for org.apache.hadoop.fs.FileSystem.exists
public boolean exists(Path f) throws IOException
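Before the full examples below, here is a minimal sketch of the typical pattern: check whether a path exists before deleting or recreating it. The configuration and the path "/tmp/example-output" are hypothetical and used only for illustration.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class ExistsExample {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        // hypothetical output directory used only for this sketch
        Path outputDir = new Path("/tmp/example-output");
        FileSystem fs = outputDir.getFileSystem(conf);
        if (fs.exists(outputDir)) {
            // remove a stale output directory before re-running a job (recursive delete)
            fs.delete(outputDir, true);
        }
    }
}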
From source file:cn.edu.hfut.dmic.webcollectorcluster.generator.Injector.java
public void inject(Path crawlDir, ArrayList<String> urls)
        throws IOException, InterruptedException, ClassNotFoundException, Exception {
    Path crawldb = new Path(crawlDir, "crawldb");
    Configuration config = CrawlerConfiguration.create();
    System.out.println(config.get("mapred.jar"));
    FileSystem fs = crawldb.getFileSystem(config);
    Path tempdb = new Path(crawldb, "temp");
    if (fs.exists(tempdb)) {
        fs.delete(tempdb);
    }
    SequenceFile.Writer writer = new SequenceFile.Writer(fs, config, new Path(tempdb, "info.avro"),
            Text.class, CrawlDatum.class);
    for (String url : urls) {
        CrawlDatum crawldatum = new CrawlDatum();
        crawldatum.setUrl(url);
        crawldatum.setStatus(CrawlDatum.STATUS_DB_INJECTED);
        writer.append(new Text(url), crawldatum);
        System.out.println("inject:" + url);
    }
    writer.close();
    String[] args = new String[] { crawldb.toString(), tempdb.toString() };
    ToolRunner.run(CrawlerConfiguration.create(), new Merge(), args);
    Merge.install(crawldb);
    if (fs.exists(tempdb)) {
        fs.delete(tempdb);
    }
}
From source file:cn.edu.hfut.dmic.webcollectorcluster.generator.Merge.java
public static Job createJob(Configuration conf, Path crawldb) throws IOException {
    Job job = new Job(conf);
    //job.setJarByClass(Merge.class);
    job.getConfiguration().set("mapred",
            "/home/hu/mygit/WebCollector2/WebCollectorCluster/target/WebCollectorCluster-2.0.jar");
    Path newdb = new Path(crawldb, "new");
    Path currentdb = new Path(crawldb, "current");
    FileSystem fs = crawldb.getFileSystem(CrawlerConfiguration.create());
    if (fs.exists(currentdb)) {
        FileInputFormat.addInputPath(job, currentdb);
    }
    if (fs.exists(newdb)) {
        fs.delete(newdb);
    }
    FileOutputFormat.setOutputPath(job, newdb);
    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(CrawlDatum.class);
    job.setMapperClass(MergeMap.class);
    job.setReducerClass(MergeReduce.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(CrawlDatum.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    return job;
}
From source file:cn.edu.hfut.dmic.webcollectorcluster.generator.Merge.java
public static void install(Path crawldb) throws IOException {
    FileSystem fs = crawldb.getFileSystem(CrawlerConfiguration.create());
    Path newdb = new Path(crawldb, "new");
    Path currentdb = new Path(crawldb, "current");
    Path olddb = new Path(crawldb, "old");
    if (fs.exists(currentdb)) {
        if (fs.exists(olddb)) {
            fs.delete(olddb);
        }
        fs.rename(currentdb, olddb);
    }
    fs.mkdirs(crawldb);
    fs.rename(newdb, currentdb);
}
From source file:cn.jpush.hdfs.mr.example.BaileyBorweinPlouffe.java
License:Apache License
/** Run a map/reduce job to compute Pi. */
private static void compute(int startDigit, int nDigits, int nMaps, String workingDir, Configuration conf,
        PrintStream out) throws IOException {
    final String name = startDigit + "_" + nDigits;

    // set up the working directory
    out.println("Working Directory = " + workingDir);
    out.println();
    // final FileSystem fs = FileSystem.get(conf);
    final FileSystem fs = new Path(workingDir, "part-r-00000").getFileSystem(conf);
    final Path dir = fs.makeQualified(new Path(workingDir));
    if (fs.exists(dir)) {
        throw new IOException("Working directory " + dir + " already exists. Please remove it first.");
    } else if (!fs.mkdirs(dir)) {
        throw new IOException("Cannot create working directory " + dir);
    }

    out.println("Start Digit      = " + startDigit);
    out.println("Number of Digits = " + nDigits);
    out.println("Number of Maps   = " + nMaps);

    // set up a job
    final Job job = createJob(name, conf);
    final Path hexfile = new Path(dir, "pi_" + name + ".hex");
    FileOutputFormat.setOutputPath(job, new Path(dir, "out"));

    // set up custom properties
    job.getConfiguration().set(WORKING_DIR_PROPERTY, dir.toString());
    job.getConfiguration().set(HEX_FILE_PROPERTY, hexfile.toString());
    job.getConfiguration().setInt(DIGIT_START_PROPERTY, startDigit);
    job.getConfiguration().setInt(DIGIT_SIZE_PROPERTY, nDigits);
    job.getConfiguration().setInt(DIGIT_PARTS_PROPERTY, nMaps);

    // start a map/reduce job
    out.println("\nStarting Job ...");
    final long startTime = System.currentTimeMillis();
    try {
        if (!job.waitForCompletion(true)) {
            out.println("Job failed.");
            System.exit(1);
        }
    } catch (Exception e) {
        throw new RuntimeException(e);
    } finally {
        final double duration = (System.currentTimeMillis() - startTime) / 1000.0;
        out.println("Duration is " + duration + " seconds.");
    }
    out.println("Output file: " + hexfile);
}
From source file:cn.jpush.hdfs.mr.example.WordMedian.java
License:Apache License
/**
 * This is a standard program to read and find a median value based on a
 * file of word counts such as: 1 456, 2 132, 3 56... where the first values
 * are the word lengths and the following values are the number of times
 * that words of that length appear.
 *
 * @param path
 *          The path to read the HDFS file from (part-r-00000...00001...etc).
 * @param medianIndex1
 *          The first length value to look for.
 * @param medianIndex2
 *          The second length value to look for (will be the same as the
 *          first if there are an even number of words total).
 * @throws IOException
 *           If file cannot be found, we throw an exception.
 */
private double readAndFindMedian(String path, int medianIndex1, int medianIndex2, Configuration conf)
        throws IOException {
    // FileSystem fs = FileSystem.get(conf);
    FileSystem fs = new Path(path, "part-r-00000").getFileSystem(conf);
    Path file = new Path(path, "part-r-00000");

    if (!fs.exists(file))
        throw new IOException("Output not found!");

    BufferedReader br = null;
    try {
        br = new BufferedReader(new InputStreamReader(fs.open(file), Charsets.UTF_8));
        int num = 0;
        String line;
        while ((line = br.readLine()) != null) {
            StringTokenizer st = new StringTokenizer(line);

            // grab length
            String currLen = st.nextToken();

            // grab count
            String lengthFreq = st.nextToken();

            int prevNum = num;
            num += Integer.parseInt(lengthFreq);

            if (medianIndex2 >= prevNum && medianIndex1 <= num) {
                System.out.println("The median is: " + currLen);
                br.close();
                return Double.parseDouble(currLen);
            } else if (medianIndex2 >= prevNum && medianIndex1 < num) {
                String nextCurrLen = st.nextToken();
                double theMedian = (Integer.parseInt(currLen) + Integer.parseInt(nextCurrLen)) / 2.0;
                System.out.println("The median is: " + theMedian);
                br.close();
                return theMedian;
            }
        }
    } finally {
        if (br != null) {
            br.close();
        }
    }
    // error, no median found
    return -1;
}
From source file:co.cask.cdap.common.logging.SyncTest.java
License:Apache License
@Test
@Ignore
public void testSync() throws IOException {
    FileSystem fs = FileSystem.get(config);
    // create a file and write n bytes, then sync
    Path path = new Path("/myfile");
    FSDataOutputStream out = fs.create(path, false, 4096, (short) 2, 4096L);
    int numBytes = 5000;
    for (int i = 0; i < numBytes; i++) {
        out.write((byte) i);
    }
    out.hflush();
    // verify the file is there
    Assert.assertTrue(fs.exists(path));
    // do not verify the length of the file, hflush() does not update that
    //Assert.assertEquals(numBytes, fs.getFileStatus(path).getLen());
    // read back and verify all bytes
    FSDataInputStream in = fs.open(path);
    byte[] buffer = new byte[numBytes];
    in.readFully(buffer);
    for (int i = 0; i < numBytes; i++) {
        Assert.assertEquals((byte) i, buffer[i]);
    }
    in.close();
    // now close the writer
    out.close();
}
From source file:co.cask.cdap.data.hbase.HBase10CDH550Test.java
License:Apache License
@Override
public HRegion createHRegion(byte[] tableName, byte[] startKey, byte[] stopKey, String callingMethod,
        Configuration conf, byte[]... families) throws IOException {
    if (conf == null) {
        conf = new Configuration();
    }
    HTableDescriptor htd = new HTableDescriptor(tableName);
    for (byte[] family : families) {
        htd.addFamily(new HColumnDescriptor(family));
    }
    HRegionInfo info = new HRegionInfo(htd.getTableName(), startKey, stopKey, false);
    Path path = new Path(conf.get(HConstants.HBASE_DIR), callingMethod);
    FileSystem fs = FileSystem.get(conf);
    if (fs.exists(path)) {
        if (!fs.delete(path, true)) {
            throw new IOException("Failed delete of " + path);
        }
    }
    return HRegion.createHRegion(info, path, conf, htd);
}
From source file:co.cask.cdap.data.hbase.HBase94Test.java
License:Apache License
@Override
public HRegion createHRegion(byte[] tableName, byte[] startKey, byte[] stopKey, String callingMethod,
        Configuration conf, byte[]... families) throws IOException {
    if (conf == null) {
        conf = new Configuration();
    }
    HTableDescriptor htd = new HTableDescriptor(tableName);
    for (byte[] family : families) {
        htd.addFamily(new HColumnDescriptor(family));
    }
    HRegionInfo info = new HRegionInfo(htd.getName(), startKey, stopKey, false);
    Path path = new Path(conf.get(HConstants.HBASE_DIR), callingMethod);
    FileSystem fs = FileSystem.get(conf);
    if (fs.exists(path)) {
        if (!fs.delete(path, true)) {
            throw new IOException("Failed delete of " + path);
        }
    }
    return HRegion.createHRegion(info, path, conf, htd);
}
From source file:co.cask.cdap.data.tools.HBaseTableExporter.java
License:Apache License
/**
 * Sets up the actual MapReduce job.
 * @param tx The transaction which needs to be passed to the Scan instance. This transaction is used by
 *           coprocessors to filter out the data corresponding to the invalid transactions.
 * @param tableName Name of the table which needs to be exported as HFiles.
 * @return the configured job
 * @throws IOException
 */
public Job createSubmittableJob(Transaction tx, String tableName) throws IOException {
    Job job = Job.getInstance(hConf, "HBaseTableExporter");

    job.setJarByClass(HBaseTableExporter.class);
    Scan scan = new Scan();
    scan.setCacheBlocks(false);
    // Set the transaction attribute for the scan.
    scan.setAttribute(TxConstants.TX_OPERATION_ATTRIBUTE_KEY, new TransactionCodec().encode(tx));
    job.setNumReduceTasks(0);
    TableMapReduceUtil.initTableMapperJob(tableName, scan, KeyValueImporter.class, null, null, job);

    FileSystem fs = FileSystem.get(hConf);
    Random rand = new Random();
    Path root = new Path(fs.getWorkingDirectory(), "hbasetableexporter");
    fs.mkdirs(root);
    // pick a random bulk-load directory that does not exist yet
    while (true) {
        bulkloadDir = new Path(root, "" + rand.nextLong());
        if (!fs.exists(bulkloadDir)) {
            break;
        }
    }

    HFileOutputFormat2.setOutputPath(job, bulkloadDir);
    HTable hTable = new HTable(hConf, tableName);
    HFileOutputFormat2.configureIncrementalLoad(job, hTable);
    return job;
}
From source file:co.cask.cdap.internal.app.runtime.batch.dataset.partitioned.DynamicPartitioningOutputCommitter.java
License:Apache License
@Override
public void commitJob(JobContext context) throws IOException {
    Configuration configuration = context.getConfiguration();
    MapReduceClassLoader classLoader = MapReduceClassLoader.getFromConfiguration(configuration);
    BasicMapReduceTaskContext taskContext = classLoader.getTaskContextProvider().get(this.taskContext);

    String outputDatasetName = configuration.get(Constants.Dataset.Partitioned.HCONF_ATTR_OUTPUT_DATASET);
    PartitionedFileSet outputDataset = taskContext.getDataset(outputDatasetName);
    Partitioning partitioning = outputDataset.getPartitioning();

    Set<PartitionKey> partitionsToAdd = new HashSet<>();
    Set<String> relativePaths = new HashSet<>();
    // Go over all files in the temporary directory and keep track of partitions to add for them
    FileStatus[] allCommittedTaskPaths = getAllCommittedTaskPaths(context);
    for (FileStatus committedTaskPath : allCommittedTaskPaths) {
        FileSystem fs = committedTaskPath.getPath().getFileSystem(configuration);
        RemoteIterator<LocatedFileStatus> fileIter = fs.listFiles(committedTaskPath.getPath(), true);
        while (fileIter.hasNext()) {
            Path path = fileIter.next().getPath();
            String relativePath = getRelative(committedTaskPath.getPath(), path);

            int lastPathSepIdx = relativePath.lastIndexOf(Path.SEPARATOR);
            if (lastPathSepIdx == -1) {
                // this shouldn't happen because each relative path should consist of at least one partition key and
                // the output file name
                LOG.warn("Skipping path '{}'. Its relative path '{}' has fewer than two parts", path, relativePath);
                continue;
            }
            // relativePath = "../key1/key2/part-m-00000"
            // relativeDir = "../key1/key2"
            // fileName = "part-m-00000"
            String relativeDir = relativePath.substring(0, lastPathSepIdx);
            String fileName = relativePath.substring(lastPathSepIdx + 1);

            Path finalDir = new Path(FileOutputFormat.getOutputPath(context), relativeDir);
            Path finalPath = new Path(finalDir, fileName);
            if (fs.exists(finalPath)) {
                throw new FileAlreadyExistsException("Final output path " + finalPath + " already exists");
            }
            PartitionKey partitionKey = getPartitionKey(partitioning, relativeDir);
            partitionsToAdd.add(partitionKey);
            relativePaths.add(relativeDir);
        }
    }

    // We need to copy to the parent of the FileOutputFormat's outputDir, since we added a _temporary_jobId suffix
    // to the original outputDir.
    Path finalOutput = FileOutputFormat.getOutputPath(context);
    FileSystem fs = finalOutput.getFileSystem(configuration);
    for (FileStatus stat : getAllCommittedTaskPaths(context)) {
        mergePaths(fs, stat, finalOutput);
    }

    // compute the metadata to be written to every output partition
    Map<String, String> metadata = ConfigurationUtil.getNamedConfigurations(this.taskContext.getConfiguration(),
            PartitionedFileSetArguments.OUTPUT_PARTITION_METADATA_PREFIX);

    // create all the necessary partitions
    for (PartitionKey partitionKey : partitionsToAdd) {
        PartitionOutput partitionOutput = outputDataset.getPartitionOutput(partitionKey);
        partitionOutput.setMetadata(metadata);
        partitionOutput.addPartition();
    }

    // close the TaskContext, which flushes dataset operations
    try {
        taskContext.flushOperations();
    } catch (Exception e) {
        Throwables.propagateIfPossible(e, IOException.class);
        throw new IOException(e);
    }

    // delete the job-specific _temporary folder and create a _done file in the o/p folder
    cleanupJob(context);

    // mark all the final output paths with a _SUCCESS file, if configured to do so (default = true)
    if (configuration.getBoolean(SUCCESSFUL_JOB_OUTPUT_DIR_MARKER, true)) {
        for (String relativePath : relativePaths) {
            Path pathToMark = new Path(finalOutput, relativePath);
            Path markerPath = new Path(pathToMark, SUCCEEDED_FILE_NAME);
            fs.createNewFile(markerPath);
        }
    }
}