List of usage examples for org.apache.hadoop.fs FileSystem mkdirs
public boolean mkdirs(Path f) throws IOException
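Before the project-specific examples below, a minimal sketch of the basic call pattern: get a FileSystem from a Configuration, build a Path, and call mkdirs, which creates the directory along with any missing parent directories. The path "/tmp/example/output" is only a hypothetical placeholder, and the surrounding main method is illustrative rather than taken from any of the projects listed here.

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class MkdirsExample {
    public static void main(String[] args) throws IOException {
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(conf);

        // Hypothetical target directory; mkdirs also creates missing parents.
        Path dir = new Path("/tmp/example/output");

        if (!fs.exists(dir)) {
            // Returns true if the directory exists after the call.
            boolean created = fs.mkdirs(dir);
            System.out.println("created: " + created);
        }
    }
}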
From source file:ivory.driver.PreprocessMedline.java
License:Apache License
/**
 * Runs this tool.
 */
public int run(String[] args) throws Exception {
    if (args.length != 4) {
        printUsage();
        return -1;
    }

    String collection = args[0];
    String indexPath = args[1];
    int numMappers = Integer.parseInt(args[2]);
    int numReducers = Integer.parseInt(args[3]);

    sLogger.info("Tool name: ProcessMedline");
    sLogger.info(" - Collection path: " + collection);
    sLogger.info(" - Index path: " + indexPath);

    Configuration conf = new Configuration();
    FileSystem fs = FileSystem.get(conf);

    // Create the index directory if it doesn't already exist.
    Path p = new Path(indexPath);
    if (!fs.exists(p)) {
        sLogger.info("index path doesn't exist, creating...");
        fs.mkdirs(p);
    }

    RetrievalEnvironment env = new RetrievalEnvironment(indexPath, fs);

    // Look for the docno mapping, which maps from docid (String) to docno
    // (sequentially-numbered integer). If it doesn't exist, create it.
    Path mappingFile = env.getDocnoMappingData();

    if (!fs.exists(mappingFile)) {
        sLogger.info(mappingFile + " doesn't exist, creating...");
        String[] arr = new String[] { collection, indexPath + "/medline-docid-tmp",
                mappingFile.toString(), new Integer(numMappers).toString() };
        NumberMedlineCitations tool = new NumberMedlineCitations();
        tool.setConf(conf);
        tool.run(arr);

        fs.delete(new Path(indexPath + "/medline-docid-tmp"), true);
    }

    // Now we're ready to start the preprocessing pipeline... set
    // appropriate properties.
    conf.setInt("Ivory.NumMapTasks", numMappers);
    conf.setInt("Ivory.NumReduceTasks", numReducers);

    conf.set("Ivory.CollectionName", "Medline");
    conf.set("Ivory.CollectionPath", collection);
    conf.set("Ivory.IndexPath", indexPath);
    conf.set("Ivory.Tokenizer", "ivory.tokenize.GalagoTokenizer");
    conf.set("Ivory.InputFormat", "edu.umd.cloud9.collection.medline.MedlineCitationInputFormat");
    conf.set("Ivory.DocnoMappingFile", indexPath + "docno.mapping");
    conf.set("Ivory.DocnoMappingClass", "edu.umd.cloud9.collection.medline.MedlineDocnoMapping");

    conf.setInt("Ivory.DocnoOffset", 0); // docnos start at 1
    conf.setInt("Ivory.MinDf", 2); // toss away singleton terms
    conf.setInt("Ivory.MaxDf", Integer.MAX_VALUE);
    conf.setInt("Ivory.TermIndexWindow", 8);

    new BuildTermDocVectors(conf).run();
    new GetTermCount(conf).run();
    new BuildTermIdMap(conf).run();
    new BuildIntDocVectors(conf).run();

    new BuildIntDocVectorsForwardIndex(conf).run();
    new BuildTermDocVectorsForwardIndex(conf).run();

    return 0;
}
From source file:ivory.driver.PreprocessTREC.java
License:Apache License
/**
 * Runs this tool.
 */
public int run(String[] args) throws Exception {
    if (args.length != 4) {
        printUsage();
        return -1;
    }

    String collection = args[0];
    String indexRootPath = args[1];
    int numMappers = Integer.parseInt(args[2]);
    int numReducers = Integer.parseInt(args[3]);

    sLogger.info("Tool name: PreprocessTREC");
    sLogger.info(" - Collection path: " + collection);
    sLogger.info(" - Index path: " + indexRootPath);

    Configuration conf = getConf();
    FileSystem fs = FileSystem.get(conf);

    // Create the index directory if it doesn't already exist.
    Path p = new Path(indexRootPath);
    if (!fs.exists(p)) {
        sLogger.info("index directory doesn't exist, creating...");
        fs.mkdirs(p);
    }

    RetrievalEnvironment env = new RetrievalEnvironment(indexRootPath, fs);

    // Look for the docno mapping, which maps from docid (String) to docno
    // (sequentially-numbered integer). If it doesn't exist, create it.
    Path mappingFile = env.getDocnoMappingData();
    Path mappingDir = env.getDocnoMappingDirectory();

    if (!fs.exists(mappingFile)) {
        sLogger.info("docno-mapping.dat doesn't exist, creating...");
        String[] arr = new String[] { collection, mappingDir.toString(), mappingFile.toString(),
                new Integer(numMappers).toString() };
        NumberTrecDocuments tool = new NumberTrecDocuments();
        tool.setConf(conf);
        tool.run(arr);

        fs.delete(mappingDir, true);
    }

    // Now we're ready to start the preprocessing pipeline... set
    // appropriate properties.
    conf.setInt("Ivory.NumMapTasks", numMappers);
    conf.setInt("Ivory.NumReduceTasks", numReducers);

    conf.set("Ivory.CollectionName", "TREC_vol45");
    conf.set("Ivory.CollectionPath", collection);
    conf.set("Ivory.IndexPath", indexRootPath);
    conf.set("Ivory.InputFormat", "edu.umd.cloud9.collection.trec.TrecDocumentInputFormat");
    conf.set("Ivory.Tokenizer", "ivory.tokenize.GalagoTokenizer");
    conf.set("Ivory.DocnoMappingClass", "edu.umd.cloud9.collection.trec.TrecDocnoMapping");
    conf.set("Ivory.DocnoMappingFile", env.getDocnoMappingData().toString());

    conf.setInt("Ivory.DocnoOffset", 0); // docnos start at 1
    conf.setInt("Ivory.MinDf", 2); // toss away singleton terms
    conf.setInt("Ivory.MaxDf", Integer.MAX_VALUE);
    conf.setInt("Ivory.TermIndexWindow", 8);

    new BuildTermDocVectors(conf).run();
    new GetTermCount(conf).run();
    new BuildTermIdMap(conf).run();
    new BuildIntDocVectors(conf).run();

    new BuildIntDocVectorsForwardIndex(conf).run();
    new BuildTermDocVectorsForwardIndex(conf).run();

    return 0;
}
From source file:ivory.driver.PreprocessWt10g.java
License:Apache License
/**
 * Runs this tool.
 */
public int run(String[] args) throws Exception {
    if (args.length != 4) {
        printUsage();
        return -1;
    }

    String collection = args[0];
    String indexRootPath = args[1];
    int numMappers = Integer.parseInt(args[2]);
    int numReducers = Integer.parseInt(args[3]);

    sLogger.info("Tool name: PreprocessWt10g");
    sLogger.info(" - Collection path: " + collection);
    sLogger.info(" - Index path: " + indexRootPath);

    Configuration conf = getConf();
    FileSystem fs = FileSystem.get(conf);

    // Create the index directory if it doesn't already exist.
    Path p = new Path(indexRootPath);
    if (!fs.exists(p)) {
        sLogger.info("index directory doesn't exist, creating...");
        fs.mkdirs(p);
    }

    RetrievalEnvironment env = new RetrievalEnvironment(indexRootPath, fs);

    // Look for the docno mapping, which maps from docid (String) to docno
    // (sequentially-numbered integer). If it doesn't exist, create it.
    Path mappingFile = env.getDocnoMappingData();
    Path mappingDir = env.getDocnoMappingDirectory();

    if (!fs.exists(mappingFile)) {
        sLogger.info("docno-mapping.dat doesn't exist, creating...");
        String[] arr = new String[] { collection, mappingDir.toString(), mappingFile.toString(),
                new Integer(numMappers).toString() };
        NumberTrecWebDocuments tool = new NumberTrecWebDocuments();
        tool.setConf(conf);
        tool.run(arr);

        fs.delete(mappingDir, true);
    }

    // Now we're ready to start the preprocessing pipeline... set
    // appropriate properties.
    conf.setInt("Ivory.NumMapTasks", numMappers);
    conf.setInt("Ivory.NumReduceTasks", numReducers);

    conf.set("Ivory.CollectionName", "Wt10g");
    conf.set("Ivory.CollectionPath", collection);
    conf.set("Ivory.IndexPath", indexRootPath);
    conf.set("Ivory.InputFormat", "org.apache.hadoop.mapred.SequenceFileInputFormat");
    conf.set("Ivory.Tokenizer", "ivory.tokenize.GalagoTokenizer");
    conf.set("Ivory.DocnoMappingClass", "edu.umd.cloud9.collection.trecweb.Wt10gDocnoMapping");
    conf.set("Ivory.DocnoMappingFile", mappingFile.toString());

    conf.setInt("Ivory.DocnoOffset", 0); // docnos start at 1
    conf.setInt("Ivory.MinDf", 10);
    conf.setInt("Ivory.MaxDf", Integer.MAX_VALUE);
    conf.setInt("Ivory.TermIndexWindow", 8);

    new BuildTermDocVectors(conf).run();
    new GetTermCount(conf).run();
    new BuildTermIdMap(conf).run();
    new BuildIntDocVectors(conf).run();

    new BuildIntDocVectorsForwardIndex(conf).run();
    new BuildTermDocVectorsForwardIndex(conf).run();

    return 0;
}
From source file:ivory.index.BuildIPInvertedIndexDocSorted.java
License:Apache License
@SuppressWarnings("unused")
public int runTool() throws Exception {
    JobConf conf = new JobConf(getConf(), BuildIPInvertedIndexDocSorted.class);
    FileSystem fs = FileSystem.get(conf);

    String indexPath = conf.get("Ivory.IndexPath");
    RetrievalEnvironment env = new RetrievalEnvironment(indexPath, fs);

    String collectionName = env.readCollectionName();

    int mapTasks = conf.getInt("Ivory.NumMapTasks", 0);
    int reduceTasks = conf.getInt("Ivory.NumReduceTasks", 0);
    int minSplitSize = conf.getInt("Ivory.MinSplitSize", 0);
    int collectionDocCnt = env.readCollectionDocumentCount();

    LOG.info("PowerTool: BuildIPInvertedIndexDocSorted");
    LOG.info(" - IndexPath: " + indexPath);
    LOG.info(" - CollectionName: " + collectionName);
    LOG.info(" - CollectionDocumentCount: " + collectionDocCnt);
    LOG.info(" - NumMapTasks: " + mapTasks);
    LOG.info(" - NumReduceTasks: " + reduceTasks);
    LOG.info(" - MinSplitSize: " + minSplitSize);

    if (!fs.exists(new Path(indexPath))) {
        fs.mkdirs(new Path(indexPath));
    }

    Path inputPath = new Path(env.getIntDocVectorsDirectory());
    Path postingsPath = new Path(env.getPostingsDirectory());

    if (fs.exists(postingsPath)) {
        LOG.info("Postings already exist: no indexing will be performed.");
        return 0;
    }

    conf.setJobName("BuildIPInvertedIndex:" + collectionName);

    conf.setNumMapTasks(mapTasks);
    conf.setNumReduceTasks(reduceTasks);

    conf.setInt("Ivory.CollectionDocumentCount", collectionDocCnt);

    conf.setInt("mapred.min.split.size", minSplitSize);
    conf.set("mapred.child.java.opts", "-Xmx2048m");

    FileInputFormat.setInputPaths(conf, inputPath);
    FileOutputFormat.setOutputPath(conf, postingsPath);

    conf.setInputFormat(SequenceFileInputFormat.class);
    conf.setOutputFormat(SequenceFileOutputFormat.class);
    conf.setMapOutputKeyClass(PairOfInts.class);
    conf.setMapOutputValueClass(TermPositions.class);
    conf.setOutputKeyClass(IntWritable.class);
    conf.setOutputValueClass(PostingsListDocSortedPositional.class);

    conf.setMapperClass(MyMapper.class);
    conf.setReducerClass(MyReducer.class);
    conf.setPartitionerClass(MyPartitioner.class);

    long startTime = System.currentTimeMillis();
    RunningJob job = JobClient.runJob(conf);
    LOG.info("Job Finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");

    env.writePostingsType("ivory.data.PostingsListDocSortedPositional");

    return 0;
}
From source file:jadoop.HadoopGridJob.java
License:Open Source License
/**
 * Creates a temporary working directory on the Hadoop HDFS for the job that
 * will be running. The name of this temporary directory will be the name
 * given to the job. If there is an existing directory with the same name as
 * the job's name, this method generates a new name so that the temporary
 * directory does not share a name with another directory on the HDFS.
 *
 * @return the path of the new temporary working directory on the HDFS
 *
 * @throws IOException
 *             if there is a problem creating the temporary working
 *             directory.
 */
private Path createTemporaryDirectory(FileSystem fs) throws IOException {
    // path to the HDFS home directory
    Path hdfsHome = fs.getHomeDirectory();

    // base name of the temporary working directory.
    Path newHDFSDir = new Path("/" + jobName);

    // full path to the temporary working directory on the HDFS.
    Path tempHDFSWorkingDir = Path.mergePaths(hdfsHome, newHDFSDir);

    // append numbers to the job name until there is no conflict...
    int number = 1;
    while (fs.exists(tempHDFSWorkingDir)) {
        Path jobNum = new Path("/" + jobName + number);
        tempHDFSWorkingDir = Path.mergePaths(hdfsHome, jobNum);
        number++;
    }

    // make the directory on the HDFS and return the path to it.
    fs.mkdirs(tempHDFSWorkingDir);
    return tempHDFSWorkingDir;
}
From source file:jadoop.HadoopGridJob.java
License:Open Source License
/**
 * Creates a directory named "input" in the temporary working directory on
 * the Hadoop HDFS.
 *
 * @param fs
 *            the Hadoop HDFS file system
 * @param hdfsDirectory
 *            path to the temporary working directory on the HDFS in which
 *            the input directory is to be created.
 *
 * @return a path to the input directory that was created.
 *
 * @throws IOException
 *             if there is a problem creating the input directory.
 */
private Path createInputDirectory(FileSystem fs, Path hdfsDirectory) throws IOException {
    String IN_DIR = hdfsDirectory.toString() + "/input";
    Path inDir = new Path(IN_DIR);
    fs.mkdirs(inDir);
    return inDir;
}
From source file:kdp.jobcontrol.ControlledJob.java
License:Apache License
/**
 * Submit this job to mapred. The state becomes RUNNING if submission is
 * successful, FAILED otherwise.
 */
protected synchronized void submit() {
    try {
        Configuration conf = job.getConfiguration();
        if (conf.getBoolean(CREATE_DIR, false)) {
            FileSystem fs = FileSystem.get(conf);
            Path inputPaths[] = FileInputFormat.getInputPaths(job);
            for (int i = 0; i < inputPaths.length; i++) {
                if (!fs.exists(inputPaths[i])) {
                    try {
                        fs.mkdirs(inputPaths[i]);
                    } catch (IOException e) {
                    }
                }
            }
        }
        if (requiredCounters != null) {
            for (RequiredCounter counter : requiredCounters) {
                conf.set(counter.getPropertyName(), Long.toString(counter.getCounter()));
            }
        }
        job.submit();
        this.state = State.RUNNING;
    } catch (Exception ioe) {
        this.state = State.FAILED;
        this.message = StringUtils.stringifyException(ioe);
    }
}
From source file:kogiri.common.json.JsonSerializer.java
License:Open Source License
public void toJsonFile(FileSystem fs, Path file, Object obj) throws IOException {
    if (!fs.exists(file.getParent())) {
        fs.mkdirs(file.getParent());
    }

    DataOutputStream ostream = fs.create(file, true, 64 * 1024, (short) 3, 1024 * 1024);
    this.mapper.writeValue(ostream, obj);
    ostream.close();
}
From source file:kogiri.mapreduce.preprocess.indexing.stage2.KmerIndexBuilder.java
License:Open Source License
private void commitRoundIndexOutputFiles(Path roundInputPath, Path MROutputPath, Path finalOutputPath,
        Configuration conf, int kmerSize) throws IOException {
    FileSystem fs = MROutputPath.getFileSystem(conf);
    if (!fs.exists(finalOutputPath)) {
        fs.mkdirs(finalOutputPath);
    }

    FileStatus status = fs.getFileStatus(MROutputPath);
    if (status.isDir()) {
        FileStatus[] entries = fs.listStatus(MROutputPath);
        for (FileStatus entry : entries) {
            Path entryPath = entry.getPath();

            // remove unnecessary outputs
            if (MapReduceHelper.isLogFiles(entryPath)) {
                fs.delete(entryPath, true);
            } else if (MapReduceHelper.isPartialOutputFiles(entryPath)) {
                // rename outputs
                int mapreduceID = MapReduceHelper.getMapReduceID(entryPath);
                Path toPath = new Path(finalOutputPath, KmerIndexHelper
                        .makeKmerIndexPartFileName(roundInputPath.getName(), kmerSize, mapreduceID));

                LOG.info("output : " + entryPath.toString());
                LOG.info("renamed to : " + toPath.toString());
                fs.rename(entryPath, toPath);
            }
        }
    } else {
        throw new IOException("path not found : " + MROutputPath.toString());
    }

    fs.delete(MROutputPath, true);
}
From source file:kogiri.mapreduce.readfrequency.modecount.ModeCounter.java
License:Open Source License
private void commitRoundOutputFiles(Path MROutputPath, Path finalOutputPath, Configuration conf,
        NamedOutputs namedOutputs, int round) throws IOException {
    FileSystem fs = MROutputPath.getFileSystem(conf);
    if (!fs.exists(finalOutputPath)) {
        fs.mkdirs(finalOutputPath);
    }

    NamedOutputRecord roundMasterRecord = namedOutputs.getRecordFromID(round);
    Path roundDestPath = new Path(finalOutputPath, roundMasterRecord.getFilename());
    if (!fs.exists(roundDestPath)) {
        fs.mkdirs(roundDestPath);
    }

    FileStatus status = fs.getFileStatus(MROutputPath);
    if (status.isDir()) {
        FileStatus[] entries = fs.listStatus(MROutputPath);
        for (FileStatus entry : entries) {
            Path entryPath = entry.getPath();

            // remove unnecessary outputs
            if (MapReduceHelper.isLogFiles(entryPath)) {
                fs.delete(entryPath, true);
            } else if (MapReduceHelper.isPartialOutputFiles(entryPath)) {
                fs.delete(entryPath, true);
            } else {
                // rename outputs
                NamedOutputRecord namedOutput = namedOutputs.getRecordFromMROutput(entryPath);
                if (namedOutput != null) {
                    Path toPath = new Path(roundDestPath, namedOutput.getFilename() + "."
                            + ReadFrequencyCounterConstants.READ_FREQUENCY_FILENAME_FILENAME_EXTENSION);

                    LOG.info("output : " + entryPath.toString());
                    LOG.info("renamed to : " + toPath.toString());
                    fs.rename(entryPath, toPath);
                }
            }
        }
    } else {
        throw new IOException("path not found : " + MROutputPath.toString());
    }

    fs.delete(MROutputPath, true);
}