Usage examples for org.apache.hadoop.fs.FileSystem.exists
public boolean exists(Path f) throws IOException
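Before the real-world examples, here is a minimal, self-contained sketch of the check-before-use pattern they all share. The path and configuration are placeholders for illustration, not values from any of the listed sources.

import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class ExistsExample {
    public static void main(String[] args) throws IOException {
        Configuration conf = new Configuration();
        Path path = new Path("/tmp/example"); // placeholder path, not from the sources below
        // Resolve the FileSystem that owns this path (local, HDFS, etc.).
        FileSystem fs = path.getFileSystem(conf);
        if (fs.exists(path)) {
            System.out.println(path + " exists");
        } else {
            System.out.println(path + " does not exist");
        }
    }
}

The examples below follow this shape: call exists() first, then open, append to, delete, or create the path accordingly.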
From source file:cc.solr.lucene.store.hdfs.ConvertDirectory.java
License:Apache License
public static void convert(Path path) throws IOException {
    FileSystem fileSystem = FileSystem.get(path.toUri(), new Configuration());
    if (!fileSystem.exists(path)) {
        System.out.println(path + " does not exist.");
        return;
    }
    FileStatus fileStatus = fileSystem.getFileStatus(path);
    if (fileStatus.isDir()) {
        // Recurse into directories and convert each contained file.
        FileStatus[] listStatus = fileSystem.listStatus(path);
        for (FileStatus status : listStatus) {
            convert(status.getPath());
        }
    } else {
        System.out.println("Converting file [" + path + "]");
        HdfsMetaBlock block = new HdfsMetaBlock();
        block.realPosition = 0;
        block.logicalPosition = 0;
        block.length = fileStatus.getLen();
        FSDataOutputStream outputStream = fileSystem.append(path);
        block.write(outputStream);
        outputStream.writeInt(1);
        outputStream.writeLong(fileStatus.getLen());
        outputStream.writeInt(HdfsFileWriter.VERSION);
        outputStream.close();
    }
}
From source file:cc.solr.lucene.store.hdfs.HdfsFileReader.java
License:Apache License
public HdfsFileReader(FileSystem fileSystem, Path path, int bufferSize) throws IOException {
    if (!fileSystem.exists(path)) {
        throw new FileNotFoundException(path.toString());
    }
    FileStatus fileStatus = fileSystem.getFileStatus(path);
    _hdfsLength = fileStatus.getLen();
    _inputStream = fileSystem.open(path, bufferSize);

    // read meta blocks
    _inputStream.seek(_hdfsLength - 16);
    int numberOfBlocks = _inputStream.readInt();
    _length = _inputStream.readLong();
    int version = _inputStream.readInt();
    if (version != VERSION) {
        throw new RuntimeException("Version of file [" + version + "] does not match reader [" + VERSION + "]");
    }
    _inputStream.seek(_hdfsLength - 16 - (numberOfBlocks * 24)); // 3 longs per block
    _metaBlocks = new ArrayList<HdfsMetaBlock>(numberOfBlocks);
    for (int i = 0; i < numberOfBlocks; i++) {
        HdfsMetaBlock hdfsMetaBlock = new HdfsMetaBlock();
        hdfsMetaBlock.readFields(_inputStream);
        _metaBlocks.add(hdfsMetaBlock);
    }
    seek(0);
}
From source file:chapter5.KMeanSample.java
License:Apache License
/**
 * Return the path to the final iteration's clusters.
 */
private static Path finalClusterPath(Configuration conf, Path output, int maxIterations) throws IOException {
    FileSystem fs = FileSystem.get(conf);
    // Walk backwards from the last possible iteration to the first
    // and return the first clusters directory that actually exists.
    for (int i = maxIterations; i >= 0; i--) {
        Path clusters = new Path(output, "clusters-" + i);
        if (fs.exists(clusters)) {
            return clusters;
        }
    }
    return null;
}
From source file:cn.edu.hfut.dmic.webcollector.crawldb.DBReader.java
public static void main(String[] args) throws Exception {
    Path crawlPath = new Path("task2");
    Path currentPath = new Path(crawlPath, "crawldb/current");
    Path output = new Path("output");
    Configuration config = CrawlerConfiguration.create();
    FileSystem fs = FileSystem.get(config);
    // Remove any previous output so the job can write a fresh directory.
    if (fs.exists(output)) {
        fs.delete(output, true);
    }
    Job job = new Job(config);
    job.setJobName("dbreader " + crawlPath.toString());
    job.setMapperClass(DBReaderMapper.class);
    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setOutputFormatClass(TextOutputFormat.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(Text.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);
    FileInputFormat.addInputPath(job, currentPath);
    FileOutputFormat.setOutputPath(job, output);
    job.waitForCompletion(true);
}
From source file:cn.edu.hfut.dmic.webcollector.crawldb.Injector.java
public static void inject(Path crawlPath, CrawlDatums datums, Configuration conf)
        throws IOException, InterruptedException, ClassNotFoundException, Exception {
    Path crawldbPath = new Path(crawlPath, "crawldb");
    FileSystem fs = FileSystem.get(conf);
    Path tempdb = new Path(crawldbPath, "temp");
    if (fs.exists(tempdb)) {
        fs.delete(tempdb, true);
    }
    // Write the injected datums into a temporary database, then merge it in.
    SequenceFile.Writer writer = new SequenceFile.Writer(fs, conf, new Path(tempdb, "info"), Text.class,
            CrawlDatum.class);
    for (CrawlDatum datum : datums) {
        String key = datum.getKey();
        writer.append(new Text(key), datum);
        LOG.info("inject:" + key);
    }
    writer.close();
    Path[] mergePaths = new Path[] { tempdb };
    Merge.merge(crawlPath, mergePaths, conf, "inject");
    Merge.install(crawlPath, conf);
    // Clean up the temporary database once the merge has been installed.
    if (fs.exists(tempdb)) {
        fs.delete(tempdb, true);
    }
}
From source file:cn.edu.hfut.dmic.webcollector.crawldb.Merge.java
public static void merge(Path crawlPath, Path[] mergePaths, Configuration conf, String jobName) throws Exception {
    Job job = new Job(conf);
    job.setJobName(jobName + " " + crawlPath.toString());
    job.setJarByClass(Merge.class);
    // job.getConfiguration().set("mapred", "/home/hu/mygit/WebCollector2/WebCollectorCluster/target/WebCollectorCluster-2.0.jar");
    Path crawldbPath = new Path(crawlPath, "crawldb");
    Path newdb = new Path(crawldbPath, "new");
    Path currentdb = new Path(crawldbPath, "current");
    FileSystem fs = FileSystem.get(conf);
    if (fs.exists(currentdb)) {
        FileInputFormat.addInputPath(job, currentdb);
    }
    if (fs.exists(newdb)) {
        fs.delete(newdb, true);
    }
    for (Path mergePath : mergePaths) {
        FileInputFormat.addInputPath(job, mergePath);
    }
    FileOutputFormat.setOutputPath(job, newdb);
    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(CrawlDatum.class);
    job.setMapperClass(MergeMap.class);
    job.setReducerClass(MergeReduce.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(CrawlDatum.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    job.waitForCompletion(true);
}
From source file:cn.edu.hfut.dmic.webcollector.crawldb.Merge.java
public static void install(Path crawlPath, Configuration conf) throws IOException {
    FileSystem fs = FileSystem.get(conf);
    Path crawldbPath = new Path(crawlPath, "crawldb");
    Path newdb = new Path(crawldbPath, "new");
    Path currentdb = new Path(crawldbPath, "current");
    Path olddb = new Path(crawldbPath, "old");
    // Rotate the databases: current becomes old, new becomes current.
    if (fs.exists(currentdb)) {
        if (fs.exists(olddb)) {
            fs.delete(olddb, true);
        }
        fs.rename(currentdb, olddb);
    }
    fs.rename(newdb, currentdb);
}
From source file:cn.edu.hfut.dmic.webcollector.crawldb.SegmentUtil.java
public static void initSegments(Path crawlPath, Configuration conf) throws IOException {
    Path segmentsPath = new Path(crawlPath, "segments");
    FileSystem fs = FileSystem.get(conf);
    if (!fs.exists(segmentsPath)) {
        fs.mkdirs(segmentsPath);
    }
}
From source file:cn.edu.hfut.dmic.webcollectorcluster.crawler.Crawler.java
public void start(int depth) throws Exception {
    Configuration conf = CrawlerConfiguration.create();
    FileSystem fs = crawlDir.getFileSystem(conf);
    // Unless resuming, start from scratch by removing any previous crawl directory.
    if (!resumable) {
        if (fs.exists(crawlDir)) {
            fs.delete(crawlDir, true);
        }
    }
    inject();
    for (int i = 0; i < depth; i++) {
        LogUtils.getLogger().info("starting depth " + (i + 1));
        String segmentName = SegmentUtils.createSegmengName();
        Path segmentPath = new Path(segments, segmentName);
        String[] args = new String[] { crawldb.toString(), segmentPath.toString() };
        ToolRunner.run(CrawlerConfiguration.create(), new Fetcher(), args);
        ToolRunner.run(CrawlerConfiguration.create(), new DbUpdater(), args);
    }
}
From source file:cn.edu.hfut.dmic.webcollectorcluster.fetcher.Fetcher.java
@Override
public int run(String[] args) throws Exception {
    JobConf jc = new JobConf(getConf());
    jc.setJarByClass(Fetcher.class);
    jc.setInputFormat(SequenceFileInputFormat.class);
    Path input = new Path(args[0], "current");
    Path output = new Path(args[1]);
    Configuration conf = CrawlerConfiguration.create();
    FileSystem fs = output.getFileSystem(conf);
    if (fs.exists(output)) {
        fs.delete(output, true);
    }
    FileInputFormat.addInputPath(jc, input);
    FileOutputFormat.setOutputPath(jc, output);
    jc.setMapOutputKeyClass(Text.class);
    jc.setMapOutputValueClass(WebWritable.class);
    jc.setMapRunnerClass(Fetcher.class);
    jc.setOutputFormat(FetcherOutputFormat.class);
    JobClient.runJob(jc);
    return 0;
}