Example usage for org.apache.hadoop.fs FileSystem mkdirs

List of usage examples for org.apache.hadoop.fs FileSystem mkdirs

Introduction

On this page you can find example usages of org.apache.hadoop.fs FileSystem mkdirs.

Prototype

public boolean mkdirs(Path f) throws IOException 

Document

Call #mkdirs(Path, FsPermission) with default permission.
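
Before the full examples below, here is a minimal, self-contained sketch of both forms of the call; the paths and class name are hypothetical, not taken from the sources on this page:

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.permission.FsPermission;

public class MkdirsExample {
    public static void main(String[] args) throws IOException {
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(conf);

        // One-argument form: delegates to mkdirs(Path, FsPermission) with the
        // default permission. Like `mkdir -p`, it creates any missing parents
        // and succeeds if the directory already exists.
        Path dir = new Path("/tmp/mkdirs-example");
        boolean ok = fs.mkdirs(dir);
        System.out.println("created (or already present): " + ok);

        // Two-argument form: explicit permission (here rwxr-x---).
        fs.mkdirs(new Path(dir, "restricted"), new FsPermission((short) 0750));
    }
}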

Usage

From source file:ivory.driver.PreprocessMedline.java

License:Apache License

/**
 * Runs this tool.
 */
public int run(String[] args) throws Exception {
    if (args.length != 4) {
        printUsage();
        return -1;
    }

    String collection = args[0];
    String indexPath = args[1];
    int numMappers = Integer.parseInt(args[2]);
    int numReducers = Integer.parseInt(args[3]);

    sLogger.info("Tool name: ProcessMedline");
    sLogger.info(" - Collection path: " + collection);
    sLogger.info(" - Index path: " + indexPath);

    Configuration conf = new Configuration();
    FileSystem fs = FileSystem.get(conf);

    // Create the index directory if it doesn't already exist.
    Path p = new Path(indexPath);
    if (!fs.exists(p)) {
        sLogger.info("index path doesn't exist, creating...");
        fs.mkdirs(p);
    }

    RetrievalEnvironment env = new RetrievalEnvironment(indexPath, fs);

    // Look for the docno mapping, which maps from docid (String) to docno
    // (sequentially-numbered integer). If it doesn't exist, create it.
    Path mappingFile = env.getDocnoMappingData();
    if (!fs.exists(mappingFile)) {
        sLogger.info(mappingFile + " doesn't exist, creating...");
        String[] arr = new String[] { collection, indexPath + "/medline-docid-tmp", mappingFile.toString(),
                Integer.toString(numMappers) };
        NumberMedlineCitations tool = new NumberMedlineCitations();
        tool.setConf(conf);
        tool.run(arr);

        fs.delete(new Path(indexPath + "/medline-docid-tmp"), true);
    }

    // Now we're ready to start the preprocessing pipeline... set
    // appropriate properties.
    conf.setInt("Ivory.NumMapTasks", numMappers);
    conf.setInt("Ivory.NumReduceTasks", numReducers);

    conf.set("Ivory.CollectionName", "Medline");
    conf.set("Ivory.CollectionPath", collection);
    conf.set("Ivory.IndexPath", indexPath);
    conf.set("Ivory.Tokenizer", "ivory.tokenize.GalagoTokenizer");
    conf.set("Ivory.InputFormat", "edu.umd.cloud9.collection.medline.MedlineCitationInputFormat");
    conf.set("Ivory.DocnoMappingFile", indexPath + "docno.mapping");
    conf.set("Ivory.DocnoMappingClass", "edu.umd.cloud9.collection.medline.MedlineDocnoMapping");

    conf.setInt("Ivory.DocnoOffset", 0); // docnos start at 1
    conf.setInt("Ivory.MinDf", 2); // toss away singleton terms
    conf.setInt("Ivory.MaxDf", Integer.MAX_VALUE);
    conf.setInt("Ivory.TermIndexWindow", 8);

    new BuildTermDocVectors(conf).run();
    new GetTermCount(conf).run();
    new BuildTermIdMap(conf).run();
    new BuildIntDocVectors(conf).run();

    new BuildIntDocVectorsForwardIndex(conf).run();
    new BuildTermDocVectorsForwardIndex(conf).run();

    return 0;
}
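
One detail worth noting before the next examples: the driver ignores the boolean that mkdirs returns. A hedged variant of the directory setup above that fails fast when creation does not succeed, using the same names as the surrounding code:

// Create the index directory if it doesn't already exist, and fail fast
// if mkdirs reports that it could not be created.
Path p = new Path(indexPath);
if (!fs.exists(p)) {
    sLogger.info("index path doesn't exist, creating...");
    if (!fs.mkdirs(p)) {
        throw new IOException("Could not create index path: " + p);
    }
}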

From source file:ivory.driver.PreprocessTREC.java

License:Apache License

/**
 * Runs this tool.
 */
public int run(String[] args) throws Exception {
    if (args.length != 4) {
        printUsage();
        return -1;
    }

    String collection = args[0];
    String indexRootPath = args[1];
    int numMappers = Integer.parseInt(args[2]);
    int numReducers = Integer.parseInt(args[3]);

    sLogger.info("Tool name: PreprocessTREC");
    sLogger.info(" - Collection path: " + collection);
    sLogger.info(" - Index path: " + indexRootPath);

    Configuration conf = getConf();
    FileSystem fs = FileSystem.get(conf);

    // Create the index directory if it doesn't already exist.
    Path p = new Path(indexRootPath);
    if (!fs.exists(p)) {
        sLogger.info("index directory doesn't exist, creating...");
        fs.mkdirs(p);
    }

    RetrievalEnvironment env = new RetrievalEnvironment(indexRootPath, fs);

    // Look for the docno mapping, which maps from docid (String) to docno
    // (sequentially-numbered integer). If it doesn't exist, create it.
    Path mappingFile = env.getDocnoMappingData();
    Path mappingDir = env.getDocnoMappingDirectory();

    if (!fs.exists(mappingFile)) {
        sLogger.info("docno-mapping.dat doesn't exist, creating...");
        String[] arr = new String[] { collection, mappingDir.toString(), mappingFile.toString(),
                Integer.toString(numMappers) };
        NumberTrecDocuments tool = new NumberTrecDocuments();
        tool.setConf(conf);
        tool.run(arr);

        fs.delete(mappingDir, true);
    }

    // Now we're ready to start the preprocessing pipeline... set
    // appropriate properties.
    conf.setInt("Ivory.NumMapTasks", numMappers);
    conf.setInt("Ivory.NumReduceTasks", numReducers);

    conf.set("Ivory.CollectionName", "TREC_vol45");
    conf.set("Ivory.CollectionPath", collection);
    conf.set("Ivory.IndexPath", indexRootPath);
    conf.set("Ivory.InputFormat", "edu.umd.cloud9.collection.trec.TrecDocumentInputFormat");
    conf.set("Ivory.Tokenizer", "ivory.tokenize.GalagoTokenizer");
    conf.set("Ivory.DocnoMappingClass", "edu.umd.cloud9.collection.trec.TrecDocnoMapping");
    conf.set("Ivory.DocnoMappingFile", env.getDocnoMappingData().toString());

    conf.setInt("Ivory.DocnoOffset", 0); // docnos start at 1
    conf.setInt("Ivory.MinDf", 2); // toss away singleton terms
    conf.setInt("Ivory.MaxDf", Integer.MAX_VALUE);
    conf.setInt("Ivory.TermIndexWindow", 8);

    new BuildTermDocVectors(conf).run();
    new GetTermCount(conf).run();
    new BuildTermIdMap(conf).run();
    new BuildIntDocVectors(conf).run();

    new BuildIntDocVectorsForwardIndex(conf).run();
    new BuildTermDocVectorsForwardIndex(conf).run();

    return 0;
}

From source file:ivory.driver.PreprocessWt10g.java

License:Apache License

/**
 * Runs this tool.
 */
public int run(String[] args) throws Exception {
    if (args.length != 4) {
        printUsage();
        return -1;
    }

    String collection = args[0];
    String indexRootPath = args[1];
    int numMappers = Integer.parseInt(args[2]);
    int numReducers = Integer.parseInt(args[3]);

    sLogger.info("Tool name: PreprocessWt10g");
    sLogger.info(" - Collection path: " + collection);
    sLogger.info(" - Index path: " + indexRootPath);

    Configuration conf = getConf();
    FileSystem fs = FileSystem.get(conf);

    // Create the index directory if it doesn't already exist.
    Path p = new Path(indexRootPath);
    if (!fs.exists(p)) {
        sLogger.info("index directory doesn't exist, creating...");
        fs.mkdirs(p);
    }

    RetrievalEnvironment env = new RetrievalEnvironment(indexRootPath, fs);

    // Look for the docno mapping, which maps from docid (String) to docno
    // (sequentially-numbered integer). If it doesn't exist, create it.
    Path mappingFile = env.getDocnoMappingData();
    Path mappingDir = env.getDocnoMappingDirectory();

    if (!fs.exists(mappingFile)) {
        sLogger.info("docno-mapping.dat doesn't exist, creating...");
        String[] arr = new String[] { collection, mappingDir.toString(), mappingFile.toString(),
                Integer.toString(numMappers) };
        NumberTrecWebDocuments tool = new NumberTrecWebDocuments();
        tool.setConf(conf);
        tool.run(arr);

        fs.delete(mappingDir, true);
    }

    // Now we're ready to start the preprocessing pipeline... set
    // appropriate properties.
    conf.setInt("Ivory.NumMapTasks", numMappers);
    conf.setInt("Ivory.NumReduceTasks", numReducers);

    conf.set("Ivory.CollectionName", "Wt10g");
    conf.set("Ivory.CollectionPath", collection);
    conf.set("Ivory.IndexPath", indexRootPath);
    conf.set("Ivory.InputFormat", "org.apache.hadoop.mapred.SequenceFileInputFormat");
    conf.set("Ivory.Tokenizer", "ivory.tokenize.GalagoTokenizer");
    conf.set("Ivory.DocnoMappingClass", "edu.umd.cloud9.collection.trecweb.Wt10gDocnoMapping");
    conf.set("Ivory.DocnoMappingFile", mappingFile.toString());

    conf.setInt("Ivory.DocnoOffset", 0); // docnos start at 1
    conf.setInt("Ivory.MinDf", 10);
    conf.setInt("Ivory.MaxDf", Integer.MAX_VALUE);
    conf.setInt("Ivory.TermIndexWindow", 8);

    new BuildTermDocVectors(conf).run();
    new GetTermCount(conf).run();
    new BuildTermIdMap(conf).run();
    new BuildIntDocVectors(conf).run();

    new BuildIntDocVectorsForwardIndex(conf).run();
    new BuildTermDocVectorsForwardIndex(conf).run();

    return 0;
}

From source file:ivory.index.BuildIPInvertedIndexDocSorted.java

License:Apache License

@SuppressWarnings("unused")
public int runTool() throws Exception {
    JobConf conf = new JobConf(getConf(), BuildIPInvertedIndexDocSorted.class);
    FileSystem fs = FileSystem.get(conf);

    String indexPath = conf.get("Ivory.IndexPath");
    RetrievalEnvironment env = new RetrievalEnvironment(indexPath, fs);

    String collectionName = env.readCollectionName();

    int mapTasks = conf.getInt("Ivory.NumMapTasks", 0);
    int reduceTasks = conf.getInt("Ivory.NumReduceTasks", 0);
    int minSplitSize = conf.getInt("Ivory.MinSplitSize", 0);
    int collectionDocCnt = env.readCollectionDocumentCount();

    LOG.info("PowerTool: BuildIPInvertedIndexDocSorted");
    LOG.info(" - IndexPath: " + indexPath);
    LOG.info(" - CollectionName: " + collectionName);
    LOG.info(" - CollectionDocumentCount: " + collectionDocCnt);
    LOG.info(" - NumMapTasks: " + mapTasks);
    LOG.info(" - NumReduceTasks: " + reduceTasks);
    LOG.info(" - MinSplitSize: " + minSplitSize);

    if (!fs.exists(new Path(indexPath))) {
        fs.mkdirs(new Path(indexPath));
    }

    Path inputPath = new Path(env.getIntDocVectorsDirectory());
    Path postingsPath = new Path(env.getPostingsDirectory());

    if (fs.exists(postingsPath)) {
        LOG.info("Postings already exist: no indexing will be performed.");
        return 0;
    }

    conf.setJobName("BuildIPInvertedIndex:" + collectionName);

    conf.setNumMapTasks(mapTasks);
    conf.setNumReduceTasks(reduceTasks);

    conf.setInt("Ivory.CollectionDocumentCount", collectionDocCnt);

    conf.setInt("mapred.min.split.size", minSplitSize);
    conf.set("mapred.child.java.opts", "-Xmx2048m");

    FileInputFormat.setInputPaths(conf, inputPath);
    FileOutputFormat.setOutputPath(conf, postingsPath);

    conf.setInputFormat(SequenceFileInputFormat.class);
    conf.setOutputFormat(SequenceFileOutputFormat.class);

    conf.setMapOutputKeyClass(PairOfInts.class);
    conf.setMapOutputValueClass(TermPositions.class);
    conf.setOutputKeyClass(IntWritable.class);
    conf.setOutputValueClass(PostingsListDocSortedPositional.class);

    conf.setMapperClass(MyMapper.class);
    conf.setReducerClass(MyReducer.class);
    conf.setPartitionerClass(MyPartitioner.class);

    long startTime = System.currentTimeMillis();
    RunningJob job = JobClient.runJob(conf);
    LOG.info("Job Finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");

    env.writePostingsType("ivory.data.PostingsListDocSortedPositional");

    return 0;
}

From source file:jadoop.HadoopGridJob.java

License:Open Source License

/**
 * Creates a temporary working directory on the hadoop HDFS for the job that
 * will be running. The name of this temporary directory will be the name
 * given to the job. If there is an existing directory with the same name as
 * the job's name, this method generates a new name that will be used so that
 * the temporary directory does not share a name with another directory on
 * the HDFS.
 * 
 * @return the path of the new temporary working directory on the HDFS
 * 
 * @throws IOException
 *             if there is a problem creating the temporary working
 *             directory.
 */
private Path createTemporaryDirectory(FileSystem fs) throws IOException {
    // path to the HDFS system
    Path hdfsHome = fs.getHomeDirectory();

    // base name of the temporary working directory.
    Path newHDFSDir = new Path("/" + jobName);

    // full path to the temporary working directory on the HDFS.
    Path tempHDFSWorkingDir = Path.mergePaths(hdfsHome, newHDFSDir);

    // append numbers to the job name until there is no conflict...
    int number = 1;
    while (fs.exists(tempHDFSWorkingDir)) {
        Path jobNum = new Path("/" + jobName + number);
        tempHDFSWorkingDir = Path.mergePaths(hdfsHome, jobNum);
        number++;
    }

    // make the directory on the HDFS and return the path to it.
    fs.mkdirs(tempHDFSWorkingDir);
    return tempHDFSWorkingDir;
}
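
Note that the probe loop above is not atomic: between the final exists() check and the mkdirs() call, another client could create a directory with the same name. Where that matters, a randomized suffix avoids probing entirely; a hedged alternative sketch, not from the jadoop source:

import java.io.IOException;
import java.util.UUID;

import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

// Hypothetical variant: a UUID suffix makes a name collision vanishingly
// unlikely, at the cost of a less readable directory name.
private Path createUniqueTemporaryDirectory(FileSystem fs, String jobName) throws IOException {
    Path dir = new Path(fs.getHomeDirectory(), jobName + "-" + UUID.randomUUID());
    fs.mkdirs(dir);
    return dir;
}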

From source file:jadoop.HadoopGridJob.java

License:Open Source License

/**
 * Creates a directory named "input" in the temporary working directory on
 * the hadoop HDFS.
 * 
 * @param fs
 *            the hadoop HDFS file system
 * @param hdfsDirectory
 *            path to the temporary working directory on the HDFS in which
 *            the input directory is to be created.
 * 
 * @return a path to the input directory that was created.
 * 
 * @throws IOException
 *             if there is a problem creating the input directory.
 */
private Path createInputDirectory(FileSystem fs, Path hdfsDirectory) throws IOException {
    String IN_DIR = hdfsDirectory.toString() + "/input";
    Path inDir = new Path(IN_DIR);
    fs.mkdirs(inDir);
    return inDir;
}
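
The string concatenation works, but Path also has a parent/child constructor that handles the separator; a minimal equivalent sketch:

private Path createInputDirectory(FileSystem fs, Path hdfsDirectory) throws IOException {
    // Path(parent, child) joins the two components with a "/".
    Path inDir = new Path(hdfsDirectory, "input");
    fs.mkdirs(inDir);
    return inDir;
}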

From source file:kdp.jobcontrol.ControlledJob.java

License:Apache License

/**
 * Submit this job to mapred. The state becomes RUNNING if submission is
 * successful, FAILED otherwise.
 */
protected synchronized void submit() {
    try {
        Configuration conf = job.getConfiguration();
        if (conf.getBoolean(CREATE_DIR, false)) {
            FileSystem fs = FileSystem.get(conf);
            Path[] inputPaths = FileInputFormat.getInputPaths(job);
            for (int i = 0; i < inputPaths.length; i++) {
                if (!fs.exists(inputPaths[i])) {
                    try {
                        fs.mkdirs(inputPaths[i]);
                    } catch (IOException e) {
                        // Deliberately swallowed: if the path is truly
                        // unusable, job.submit() below will fail and report it.
                    }
                }
            }
        }
        if (requiredCounters != null) {
            for (RequiredCounter counter : requiredCounters) {
                conf.set(counter.getPropertyName(), Long.toString(counter.getCounter()));
            }
        }
        job.submit();
        this.state = State.RUNNING;
    } catch (Exception ioe) {
        this.state = State.FAILED;
        this.message = StringUtils.stringifyException(ioe);
    }
}

From source file:kogiri.common.json.JsonSerializer.java

License:Open Source License

public void toJsonFile(FileSystem fs, Path file, Object obj) throws IOException {
    if (!fs.exists(file.getParent())) {
        fs.mkdirs(file.getParent());
    }

    DataOutputStream ostream = fs.create(file, true, 64 * 1024, (short) 3, 1024 * 1024);
    this.mapper.writeValue(ostream, obj);
    ostream.close();
}
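
If writeValue throws, the stream above is never closed. A hedged variant using try-with-resources, assuming the same mapper field and keeping the original buffer size, replication, and block size:

public void toJsonFile(FileSystem fs, Path file, Object obj) throws IOException {
    Path parent = file.getParent();
    if (parent != null && !fs.exists(parent)) {
        fs.mkdirs(parent); // creates any missing ancestors as well
    }

    // try-with-resources closes the stream even if serialization fails.
    try (DataOutputStream ostream = fs.create(file, true, 64 * 1024, (short) 3, 1024 * 1024)) {
        this.mapper.writeValue(ostream, obj);
    }
}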

From source file:kogiri.mapreduce.preprocess.indexing.stage2.KmerIndexBuilder.java

License:Open Source License

private void commitRoundIndexOutputFiles(Path roundInputPath, Path MROutputPath, Path finalOutputPath,
        Configuration conf, int kmerSize) throws IOException {
    FileSystem fs = MROutputPath.getFileSystem(conf);
    if (!fs.exists(finalOutputPath)) {
        fs.mkdirs(finalOutputPath);
    }

    FileStatus status = fs.getFileStatus(MROutputPath);
    if (status.isDir()) {
        FileStatus[] entries = fs.listStatus(MROutputPath);
        for (FileStatus entry : entries) {
            Path entryPath = entry.getPath();

            // remove unnecessary outputs
            if (MapReduceHelper.isLogFiles(entryPath)) {
                fs.delete(entryPath, true);
            } else if (MapReduceHelper.isPartialOutputFiles(entryPath)) {
                // rename outputs
                int mapreduceID = MapReduceHelper.getMapReduceID(entryPath);
                Path toPath = new Path(finalOutputPath, KmerIndexHelper
                        .makeKmerIndexPartFileName(roundInputPath.getName(), kmerSize, mapreduceID));

                LOG.info("output : " + entryPath.toString());
                LOG.info("renamed to : " + toPath.toString());
                fs.rename(entryPath, toPath);
            }
        }
    } else {
        throw new IOException("path not found : " + MROutputPath.toString());
    }

    fs.delete(MROutputPath, true);
}

From source file:kogiri.mapreduce.readfrequency.modecount.ModeCounter.java

License:Open Source License

private void commitRoundOutputFiles(Path MROutputPath, Path finalOutputPath, Configuration conf,
        NamedOutputs namedOutputs, int round) throws IOException {
    FileSystem fs = MROutputPath.getFileSystem(conf);
    if (!fs.exists(finalOutputPath)) {
        fs.mkdirs(finalOutputPath);
    }

    NamedOutputRecord roundMasterRecord = namedOutputs.getRecordFromID(round);
    Path roundDestPath = new Path(finalOutputPath, roundMasterRecord.getFilename());
    if (!fs.exists(roundDestPath)) {
        fs.mkdirs(roundDestPath);
    }

    FileStatus status = fs.getFileStatus(MROutputPath);
    if (status.isDir()) {
        FileStatus[] entries = fs.listStatus(MROutputPath);
        for (FileStatus entry : entries) {
            Path entryPath = entry.getPath();

            // remove unnecessary outputs
            if (MapReduceHelper.isLogFiles(entryPath)) {
                fs.delete(entryPath, true);
            } else if (MapReduceHelper.isPartialOutputFiles(entryPath)) {
                fs.delete(entryPath, true);
            } else {
                // rename outputs
                NamedOutputRecord namedOutput = namedOutputs.getRecordFromMROutput(entryPath);
                if (namedOutput != null) {
                    Path toPath = new Path(roundDestPath, namedOutput.getFilename() + "."
                            + ReadFrequencyCounterConstants.READ_FREQUENCY_FILENAME_FILENAME_EXTENSION);

                    LOG.info("output : " + entryPath.toString());
                    LOG.info("renamed to : " + toPath.toString());
                    fs.rename(entryPath, toPath);
                }
            }
        }
    } else {
        throw new IOException("path not found : " + MROutputPath.toString());
    }

    fs.delete(MROutputPath, true);
}