Example usage for org.apache.hadoop.fs FileSystem mkdirs

Introduction

This page collects example usages of org.apache.hadoop.fs.FileSystem#mkdirs.

Prototype

public boolean mkdirs(Path f) throws IOException 

Document

Call #mkdirs(Path,FsPermission) with default permission.
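
A minimal, self-contained sketch of the typical pattern (the directory path here is a hypothetical placeholder): obtain a FileSystem from the Configuration, check existence with exists(Path), and call mkdirs(Path), which creates any missing parent directories, similar to mkdir -p on a Unix shell.

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class MkdirsExample {
    public static void main(String[] args) throws IOException {
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(conf);

        // Hypothetical target directory; substitute your own path.
        Path dir = new Path("/tmp/example-index");

        // mkdirs() creates the directory along with any missing parents
        // and returns true on success.
        if (!fs.exists(dir)) {
            boolean created = fs.mkdirs(dir);
            System.out.println("Created " + dir + ": " + created);
        }
    }
}

The drivers below follow this same exists-then-mkdirs pattern before building their indexes.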

Usage

From source file:ivory.app.PreprocessWikipedia.java

License:Apache License

/**
 * Runs this tool.
 */
public int run(String[] args) throws Exception {
    if (parseArgs(args) < 0) {
        printUsage();
        return -1;
    }
    Configuration conf = getConf();

    conf.set(Constants.Language, collectionLang);
    conf.setBoolean(Constants.Stemming, true); // default behavior of tokenizer is currently to stem, but we shouldn't rely on that

    if (tokenizerModel != null) {
        conf.set(Constants.TokenizerData, tokenizerModel);
    }

    // user can either provide a tokenizer class as a program argument, 
    // or let the factory find an appropriate class based on language code
    try {
        Class.forName(tokenizerClass);
    } catch (Exception e) {
        tokenizerClass = TokenizerFactory.getTokenizerClass(collectionLang, tokenizerModel).getCanonicalName();
    }

    if (collectionVocab != null) {
        conf.set(Constants.CollectionVocab, collectionVocab); // vocabulary to read collection from
    }
    if (e_stopwordList != null) {
        conf.set(Constants.StopwordList, e_stopwordList);
        conf.set(Constants.StemmedStopwordList, e_stopwordList + ".stemmed");
    }
    // CROSS-LINGUAL CASE
    if (mode == CROSS_LINGUAL_E) { // English side
        conf.set("Ivory.FinalVocab", collectionVocab); // vocabulary to map terms to integers in BuildTargetLang...
        conf.set(Constants.StopwordList, e_stopwordList);
        conf.set(Constants.StemmedStopwordList, e_stopwordList + ".stemmed");
    }

    if (mode == CROSS_LINGUAL_F) { // non-English side, needs to be translated
        conf.set(Constants.TargetIndexPath, targetIndexPath);
        conf.set("Ivory.F_Vocab_F2E", fVocab_f2e);
        conf.set("Ivory.E_Vocab_F2E", eVocab_f2e);
        conf.set("Ivory.TTable_F2E", ttable_f2e);
        conf.set("Ivory.E_Vocab_E2F", eVocab_e2f);
        conf.set("Ivory.F_Vocab_E2F", fVocab_e2f);
        conf.set("Ivory.TTable_E2F", ttable_e2f);
        conf.set(Constants.CollectionVocab, fVocab_f2e); // vocabulary to read collection from
        conf.set("Ivory.FinalVocab", eVocab_f2e); // vocabulary to map terms to integers in BuildTargetLang...
        if (f_stopwordList != null) {
            conf.set(Constants.StopwordList, f_stopwordList);
            conf.set(Constants.StemmedStopwordList, f_stopwordList + ".stemmed");
        }
        if (e_stopwordList != null) {
            conf.set(Constants.TargetStopwordList, e_stopwordList);
            conf.set(Constants.TargetStemmedStopwordList, e_stopwordList + ".stemmed");
        }
        if (e_tokenizerModel != null) {
            conf.set(Constants.TargetTokenizer, e_tokenizerModel);
        }
        conf.set(Constants.TargetLanguage, targetLang);
    }

    int numMappers = 100;
    int numReducers = 100;

    // Print out options
    LOG.info("Tool name: WikipediaDriver");
    LOG.info(" - Index path: " + indexRootPath);
    LOG.info(" - Raw collection path: " + rawCollection);
    LOG.info(" - Compressed collection path: " + seqCollection);
    LOG.info(" - Collection language: " + collectionLang);
    LOG.info(" - Tokenizer class: " + tokenizerClass);
    LOG.info(" - Tokenizer model: " + tokenizerModel);
    LOG.info(" - Minimum # terms per article : " + MinNumTermsPerArticle);
    LOG.info(" - Stopwords file: " + e_stopwordList);

    if (mode == CROSS_LINGUAL_E || mode == CROSS_LINGUAL_F) {
        LOG.info("Cross-lingual collection : Preprocessing " + collectionLang + " side.");
        LOG.info(" - Collection vocab file: " + conf.get(Constants.CollectionVocab));
        LOG.info(" - Tokenizer model: " + tokenizerModel);

        if (mode == CROSS_LINGUAL_F) {
            LOG.info(" - TTable file " + collectionLang + " --> " + targetLang + " : " + ttable_f2e);
            LOG.info(" - Source vocab file: " + fVocab_f2e);
            LOG.info(" - Target vocab file: " + eVocab_f2e);
            LOG.info(" - TTable file " + targetLang + " --> " + collectionLang + " : " + ttable_e2f);
            LOG.info(" - Source vocab file: " + eVocab_e2f);
            LOG.info(" - Target vocab file: " + fVocab_e2f);
            LOG.info(" - Source stopwords file: " + f_stopwordList);
            LOG.info(" - Target stopwords file: " + e_stopwordList);
            LOG.info(" - Target stemmed stopwords file: " + conf.get(Constants.TargetStemmedStopwordList));
            LOG.info(" - Target tokenizer path: " + e_tokenizerModel);
        }
    }

    FileSystem fs = FileSystem.get(conf);

    Path p = new Path(indexRootPath);
    if (!fs.exists(p)) {
        LOG.info("Index path doesn't exist, creating...");
        fs.mkdirs(p);
    }
    RetrievalEnvironment env = new RetrievalEnvironment(indexRootPath, fs);

    // Build docno mapping from raw collection
    Path mappingFile = env.getDocnoMappingData();
    if (!fs.exists(mappingFile)) {
        LOG.info(mappingFile + " doesn't exist, creating...");
        String[] arr = new String[] { "-input=" + rawCollection, "-output_file=" + mappingFile.toString(),
                "-wiki_language=" + collectionLang };
        LOG.info("Running WikipediaDocnoMappingBuilder with args " + Arrays.toString(arr));

        WikipediaDocnoMappingBuilder tool = new WikipediaDocnoMappingBuilder();
        tool.setConf(conf);
        tool.run(arr);

        fs.delete(new Path(indexRootPath + "/wiki-docid-tmp"), true);
    } else {
        LOG.info("Docno mapping already exists at: " + mappingFile);
    }

    // Repack Wikipedia into sequential compressed block
    if (!fs.exists(new Path(seqCollection + "/part-00000"))) {
        LOG.info(seqCollection + " doesn't exist, creating...");
        String[] arr = new String[] { "-input=" + rawCollection, "-output=" + seqCollection,
                "-mapping_file=" + mappingFile.toString(), "-compression_type=block",
                "-wiki_language=" + collectionLang };
        LOG.info("Running RepackWikipedia with args " + Arrays.toString(arr));

        RepackWikipedia tool = new RepackWikipedia();
        tool.setConf(conf);
        tool.run(arr);
    } else {
        LOG.info("Repacked collection already exists at: " + seqCollection);
    }

    conf.set(Constants.CollectionName, "Wikipedia-" + collectionLang);
    conf.setInt(Constants.NumMapTasks, numMappers);
    conf.setInt(Constants.NumReduceTasks, numReducers);
    conf.set(Constants.CollectionPath, seqCollection);
    conf.set(Constants.IndexPath, indexRootPath);
    conf.set(Constants.InputFormat, SequenceFileInputFormat.class.getCanonicalName());
    conf.set(Constants.DocnoMappingClass, WikipediaDocnoMapping.class.getCanonicalName());
    conf.set(Constants.Tokenizer, tokenizerClass); //"ivory.tokenize.OpenNLPTokenizer"
    conf.setInt(Constants.MinDf, MinDF);
    conf.setInt(Constants.MaxDf, Integer.MAX_VALUE);
    conf.setInt(Constants.DocnoOffset, 0); // docnos start at 1
    conf.setInt(Constants.TermIndexWindow, TermIndexWindow);

    // Builds term doc vectors from the document collection, and filters out terms that are not
    // included in Ivory.SrcVocab.
    long startTime = System.currentTimeMillis();
    long preprocessStartTime = System.currentTimeMillis();
    LOG.info("Building term doc vectors...");
    int exitCode = new BuildTermDocVectors(conf).run();
    if (exitCode >= 0) {
        LOG.info("Job BuildTermDocVectors finished in " + (System.currentTimeMillis() - startTime) / 1000.0
                + " seconds");
    } else {
        LOG.info("Error: BuildTermDocVectors. Terminating...");
        return -1;
    }

    // Get CF and DF counts.
    startTime = System.currentTimeMillis();
    LOG.info("Counting terms...");
    exitCode = new ComputeGlobalTermStatistics(conf).run();
    LOG.info("TermCount = " + env.readCollectionTermCount());
    if (exitCode >= 0) {
        LOG.info("Job ComputeGlobalTermStatistics finished in "
                + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");
    } else {
        LOG.info("Error: ComputeGlobalTermStatistics. Terminating...");
        return -1;
    }
    // Build a map from terms to sequentially generated integer term ids.
    startTime = System.currentTimeMillis();
    LOG.info("Building term-to-integer id mapping...");
    exitCode = new BuildDictionary(conf).run();
    if (exitCode >= 0) {
        LOG.info("Job BuildDictionary finished in " + (System.currentTimeMillis() - startTime) / 1000.0
                + " seconds");
    } else {
        LOG.info("Error: BuildDictionary. Terminating...");
        return -1;
    }

    // Compute term weights, and output weighted term doc vectors.
    LOG.info("Building weighted term doc vectors...");
    startTime = System.currentTimeMillis();

    conf.set("Ivory.ScoringModel", "ivory.pwsim.score.Bm25");
    conf.setBoolean("Ivory.Normalize", IsNormalized);
    conf.setInt("Ivory.MinNumTerms", MinNumTermsPerArticle);

    if (mode == CROSS_LINGUAL_F) {
        // Translate term doc vectors into English.
        exitCode = new BuildTranslatedTermDocVectors(conf).run();
    } else {
        // Build weighted term doc vectors.
        exitCode = new BuildWeightedTermDocVectors(conf).run();
    }
    if (exitCode >= 0) {
        LOG.info("Job BuildTranslated/WeightedTermDocVectors finished in "
                + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");
    } else {
        LOG.info("Error: BuildTranslated/WeightedTermDocVectors. Terminating...");
        return -1;
    }

    // normalize (optional) and convert weighted term doc vectors into int doc vectors for efficiency
    startTime = System.currentTimeMillis();
    LOG.info("Building weighted integer doc vectors...");
    conf.setBoolean("Ivory.Normalize", IsNormalized);
    if (mode == MONO_LINGUAL) {
        exitCode = new BuildIntDocVectors(conf).run();
        exitCode = new BuildWeightedIntDocVectors(conf).run();
        if (exitCode >= 0) {
            LOG.info("Job BuildWeightedIntDocVectors finished in "
                    + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");
        } else {
            LOG.info("Error: BuildWeightedIntDocVectors. Terminating...");
            return -1;
        }
    } else {
        BuildTargetLangWeightedIntDocVectors weightedIntVectorsTool = new BuildTargetLangWeightedIntDocVectors(
                conf);

        int finalNumDocs = weightedIntVectorsTool.run();

        LOG.info("Job BuildTargetLangWeightedIntDocVectors finished in "
                + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");
        if (finalNumDocs > 0) {
            LOG.info("Changed doc count: " + env.readCollectionDocumentCount() + " => " + finalNumDocs);
            env.writeCollectionDocumentCount(finalNumDocs);
        } else {
            LOG.info("No document output! Terminating...");
            return -1;
        }
        // Set Property.CollectionTermCount to the size of the target vocab, since all docs are
        // translated into that vocab. This property is read by WriteRandomVectors via
        // RunComputeSignatures.
        Vocab engVocabH = null;
        try {
            engVocabH = HadoopAlign.loadVocab(new Path(conf.get("Ivory.FinalVocab")), conf);
        } catch (IOException e) {
            e.printStackTrace();
        }
        LOG.info("Changed term count: " + env.readCollectionTermCount() + " => " + engVocabH.size());
        env.writeCollectionTermCount(engVocabH.size());
    }

    LOG.info("Preprocessing job finished in " + (System.currentTimeMillis() - preprocessStartTime) / 1000.0
            + " seconds");

    return 0;
}

From source file:ivory.core.driver.PreprocessAquaint2.java

License:Apache License

/**
 * Runs this tool.
 */
public int run(String[] args) throws Exception {
    if (args.length != 2) {
        printUsage();
        return -1;
    }

    String collection = args[0];
    String indexRootPath = args[1];

    LOG.info("Tool name: " + PreprocessAquaint2.class.getCanonicalName());
    LOG.info(" - Collection path: " + collection);
    LOG.info(" - Index path: " + indexRootPath);

    Configuration conf = getConf();
    FileSystem fs = FileSystem.get(conf);

    // Create the index directory if it doesn't already exist.
    Path p = new Path(indexRootPath);
    if (!fs.exists(p)) {
        fs.mkdirs(p);
    } else {
        LOG.info("Index directory already exists, skipping!");
    }

    RetrievalEnvironment env = new RetrievalEnvironment(indexRootPath, fs);
    conf.set(XMLInputFormat.START_TAG_KEY, Aquaint2Document.getXmlStartTag(fs, collection));
    conf.set(XMLInputFormat.END_TAG_KEY, Aquaint2Document.getXmlEndTag());

    // Look for the docno mapping, which maps from docid (String) to docno
    // (sequentially-numbered integer). If it doesn't exist, create it.
    Path mappingFile = env.getDocnoMappingData();
    Path mappingDir = env.getDocnoMappingDirectory();

    if (!fs.exists(mappingFile)) {
        String[] arr = new String[] { collection, mappingDir.toString(), mappingFile.toString() };
        NumberAquaint2Documents2 tool = new NumberAquaint2Documents2();
        tool.setConf(conf);
        tool.run(arr);
        fs.delete(mappingDir, true);
    } else {
        LOG.info("DocnoMapping already exists, skipping!");
    }
    Aquaint2DocnoMapping dm = new Aquaint2DocnoMapping();
    dm.loadMapping(mappingFile, fs);

    int docno;
    int expectedDocno;
    String expectedDocid;
    String docid;
    boolean testAquaint2 = false;
    if (testAquaint2) {
        docno = 500;
        expectedDocid = "AFP_ENG_20041001.0500";
        docid = dm.getDocid(docno);
        System.out.println("dm.getDocid(" + docno + "): " + docid + ", should be: " + expectedDocid + ", "
                + (expectedDocid.equals(docid)));
        docno = 600;
        expectedDocid = "AFP_ENG_20041001.0600";
        docid = dm.getDocid(docno);
        System.out.println("dm.getDocid(" + docno + "): " + docid + ", should be: " + expectedDocid + ", "
                + (expectedDocid.equals(docid)));
        docno = 700;
        expectedDocid = "AFP_ENG_20041001.0701";
        docid = dm.getDocid(docno);
        System.out.println("dm.getDocid(" + docno + "): " + docid + ", should be: " + expectedDocid + ", "
                + (expectedDocid.equals(docid)));
        docno = 800;
        expectedDocid = "AFP_ENG_20041003.0019";
        docid = dm.getDocid(docno);
        System.out.println("dm.getDocid(" + docno + "): " + docid + ", should be: " + expectedDocid + ", "
                + (expectedDocid.equals(docid)));
        expectedDocno = 500;
        docid = "AFP_ENG_20041001.0500";
        docno = dm.getDocno(docid);
        System.out.println("dm.getDocno(" + docid + "): " + docno + ", should be: " + expectedDocno + ", "
                + (expectedDocno == docno));
        expectedDocno = 600;
        docid = "AFP_ENG_20041001.0600";
        docno = dm.getDocno(docid);
        System.out.println("dm.getDocno(" + docid + "): " + docno + ", should be: " + expectedDocno + ", "
                + (expectedDocno == docno));
        expectedDocno = 700;
        docid = "AFP_ENG_20041001.0701";
        docno = dm.getDocno(docid);
        System.out.println("dm.getDocno(" + docid + "): " + docno + ", should be: " + expectedDocno + ", "
                + (expectedDocno == docno));
        expectedDocno = 800;
        docid = "AFP_ENG_20041003.0019";
        docno = dm.getDocno(docid);
        System.out.println("dm.getDocno(" + docid + "): " + docno + ", should be: " + expectedDocno + ", "
                + (expectedDocno == docno));
        System.out.println("finished testing, now exiting");
        return 0;
    }
    boolean testGigaword = false;
    if (testGigaword) {
        for (int i = 1; i < 301; i++) {
            docno = i * 1000;
            docid = dm.getDocid(docno);
            System.out.println("dm.getDocid(" + docno + "): " + docid);
        }
        System.out.println("finished testing, now exiting");
        return 0;
    }

    conf.set(Constants.CollectionName, "Aquaint2");
    conf.set(Constants.CollectionPath, collection);
    conf.set(Constants.IndexPath, indexRootPath);
    conf.set(Constants.InputFormat, Aquaint2DocumentInputFormat2.class.getCanonicalName());
    conf.set(Constants.Tokenizer, GalagoTokenizer.class.getCanonicalName());
    conf.set(Constants.DocnoMappingClass, Aquaint2DocnoMapping.class.getCanonicalName());
    conf.set(Constants.DocnoMappingFile, env.getDocnoMappingData().toString());

    conf.setInt(Constants.DocnoOffset, 0); // docnos start at 1
    conf.setInt(Constants.MinDf, 2); // toss away singleton terms
    conf.setInt(Constants.MaxDf, Integer.MAX_VALUE);
    conf.setInt(Constants.TermIndexWindow, 8);

    new BuildTermDocVectors(conf).run();

    new ComputeGlobalTermStatistics(conf).run();
    new BuildDictionary(conf).run();
    new BuildIntDocVectors(conf).run();

    new BuildIntDocVectorsForwardIndex(conf).run();
    //new BuildTermDocVectorsForwardIndex(conf).run();

    new BuildIPInvertedIndexDocSorted(conf).run();

    conf.set(Constants.ScoringModel, "ivory.pwsim.score.TfIdf");
    conf.setBoolean(Constants.Normalize, true);

    new BuildIntPostingsForwardIndex(conf).run();

    boolean buildingVectors = true;
    //boolean buildingVectors = false;
    if (buildingVectors) {
        //new BuildWeightedIntDocVectors(conf).run();

        //conf.setBoolean(Constants.BuildWeighted, true);
        //new BuildIntDocVectorsForwardIndex(conf).run();

        String findexDirPath = indexRootPath + "/findex";
        String findexFilePath = indexRootPath + "/findex.dat";
        if (fs.exists(new Path(findexDirPath))) {
            LOG.info("ForwardIndex already exists: Skipping!");
        } else {
            new BuildAquaint2ForwardIndex().runTool(conf, collection, findexDirPath, findexFilePath,
                    mappingFile.toString());
        }
    }

    return 0;
}

From source file:ivory.core.driver.PreprocessGov2.java

License:Apache License

/**
 * Runs this tool.
 */
public int run(String[] args) throws Exception {
    if (args.length != 2) {
        printUsage();
        return -1;
    }

    String collection = args[0];
    String indexRootPath = args[1];

    LOG.info("Tool name: " + PreprocessGov2.class.getCanonicalName());
    LOG.info(" - Collection path: " + collection);
    LOG.info(" - Index path: " + indexRootPath);

    Configuration conf = getConf();
    FileSystem fs = FileSystem.get(conf);

    // Create the index directory if it doesn't already exist.
    Path p = new Path(indexRootPath);
    if (!fs.exists(p)) {
        LOG.info("index directory doesn't exist, creating...");
        fs.mkdirs(p);
    }

    RetrievalEnvironment env = new RetrievalEnvironment(indexRootPath, fs);

    // Look for the docno mapping, which maps from docid (String) to docno
    // (sequentially-numbered integer). If it doesn't exist, create it.
    Path mappingFile = env.getDocnoMappingData();
    Path mappingDir = env.getDocnoMappingDirectory();

    if (!fs.exists(mappingFile)) {
        LOG.info("docno-mapping.dat doesn't exist, creating...");
        String[] arr = new String[] { collection, mappingDir.toString(), mappingFile.toString(), "100" };
        NumberTrecWebDocuments tool = new NumberTrecWebDocuments();
        tool.setConf(conf);
        tool.run(arr);

        fs.delete(mappingDir, true);
    }

    conf.set(Constants.CollectionName, "Gov2");
    conf.set(Constants.CollectionPath, collection);
    conf.set(Constants.IndexPath, indexRootPath);
    conf.set(Constants.InputFormat, SequenceFileInputFormat.class.getCanonicalName());
    conf.set(Constants.Tokenizer, GalagoTokenizer.class.getCanonicalName());
    conf.set(Constants.DocnoMappingClass, Gov2DocnoMapping.class.getCanonicalName());
    conf.set(Constants.DocnoMappingFile, mappingFile.toString());

    conf.setInt(Constants.DocnoOffset, 0); // docnos start at 1
    conf.setInt(Constants.MinDf, 10);
    conf.setInt(Constants.MaxDf, Integer.MAX_VALUE);
    conf.setInt(Constants.TermIndexWindow, 8);

    new BuildTermDocVectors(conf).run();
    new ComputeGlobalTermStatistics(conf).run();
    new BuildDictionary(conf).run();
    new BuildIntDocVectors(conf).run();

    new BuildIntDocVectorsForwardIndex(conf).run();
    new BuildTermDocVectorsForwardIndex(conf).run();

    return 0;
}

From source file:ivory.core.driver.PreprocessMedline.java

License:Apache License

/**
 * Runs this tool.
 */
public int run(String[] args) throws Exception {
    if (args.length != 2) {
        printUsage();
        return -1;
    }

    String collection = args[0];
    String indexPath = args[1];

    LOG.info("Tool name: ProcessMedline");
    LOG.info(" - Collection path: " + collection);
    LOG.info(" - Index path: " + indexPath);

    Configuration conf = getConf();
    FileSystem fs = FileSystem.get(conf);

    // Create the index directory if it doesn't already exist.
    Path p = new Path(indexPath);
    if (!fs.exists(p)) {
        LOG.info("index path doesn't exist, creating...");
        fs.mkdirs(p);
    }

    RetrievalEnvironment env = new RetrievalEnvironment(indexPath, fs);

    // Look for the docno mapping, which maps from docid (String) to docno (sequentially-numbered
    // integer). If it doesn't exist, create it.
    Path mappingFile = env.getDocnoMappingData();
    Path mappingDir = env.getDocnoMappingDirectory();

    if (!fs.exists(mappingFile)) {
        LOG.info(mappingFile + " doesn't exist, creating...");
        String[] arr = new String[] { collection, mappingDir.toString(), mappingFile.toString() };
        NumberMedlineCitations2 tool = new NumberMedlineCitations2();
        tool.setConf(conf);
        tool.run(arr);

        fs.delete(mappingDir, true);
    }

    conf.set(Constants.CollectionName, "Medline");
    conf.set(Constants.CollectionPath, collection);
    conf.set(Constants.IndexPath, indexPath);
    conf.set(Constants.InputFormat, MedlineCitationInputFormat2.class.getCanonicalName());
    conf.set(Constants.Tokenizer, GalagoTokenizer.class.getCanonicalName());
    conf.set(Constants.DocnoMappingClass, MedlineDocnoMapping.class.getCanonicalName());
    conf.set(Constants.DocnoMappingFile, env.getDocnoMappingData().toString());

    conf.setInt(Constants.DocnoOffset, 0); // docnos start at 1
    conf.setInt(Constants.MinDf, 2); // toss away singleton terms
    conf.setInt(Constants.MaxDf, Integer.MAX_VALUE);

    new BuildTermDocVectors(conf).run();
    new ComputeGlobalTermStatistics(conf).run();
    new BuildDictionary(conf).run();
    new BuildIntDocVectors(conf).run();

    new BuildIntDocVectorsForwardIndex(conf).run();
    new BuildTermDocVectorsForwardIndex(conf).run();

    return 0;
}

From source file:ivory.core.driver.PreprocessTREC.java

License:Apache License

/**
 * Runs this tool.
 */
public int run(String[] args) throws Exception {
    if (args.length != 2) {
        printUsage();
        return -1;
    }

    String collection = args[0];
    String indexRootPath = args[1];

    LOG.info("Tool name: " + PreprocessTREC.class.getCanonicalName());
    LOG.info(" - Collection path: " + collection);
    LOG.info(" - Index path: " + indexRootPath);

    Configuration conf = getConf();
    FileSystem fs = FileSystem.get(conf);

    // Create the index directory if it doesn't already exist.
    Path p = new Path(indexRootPath);
    if (!fs.exists(p)) {
        LOG.info("index directory doesn't exist, creating...");
        fs.mkdirs(p);
    }

    RetrievalEnvironment env = new RetrievalEnvironment(indexRootPath, fs);

    // Look for the docno mapping, which maps from docid (String) to docno (sequentially-numbered
    // integer). If it doesn't exist, create it.
    Path mappingFile = env.getDocnoMappingData();
    Path mappingDir = env.getDocnoMappingDirectory();

    if (!fs.exists(mappingFile)) {
        LOG.info("docno-mapping.dat doesn't exist, creating...");
        String[] arr = new String[] { collection, mappingDir.toString(), mappingFile.toString() };
        NumberTrecDocuments2 tool = new NumberTrecDocuments2();
        tool.setConf(conf);
        tool.run(arr);

        fs.delete(mappingDir, true);
    }

    conf.set(Constants.CollectionName, "TREC_vol45");
    conf.set(Constants.CollectionPath, collection);
    conf.set(Constants.IndexPath, indexRootPath);
    conf.set(Constants.InputFormat, TrecDocumentInputFormat2.class.getCanonicalName());
    conf.set(Constants.Tokenizer, GalagoTokenizer.class.getCanonicalName());
    conf.set(Constants.DocnoMappingClass, TrecDocnoMapping.class.getCanonicalName());
    conf.set(Constants.DocnoMappingFile, env.getDocnoMappingData().toString());

    conf.setInt(Constants.DocnoOffset, 0); // docnos start at 1
    conf.setInt(Constants.MinDf, 2); // toss away singleton terms
    conf.setInt(Constants.MaxDf, Integer.MAX_VALUE);
    conf.setInt(Constants.TermIndexWindow, 8);

    new BuildTermDocVectors(conf).run();
    new ComputeGlobalTermStatistics(conf).run();
    new BuildDictionary(conf).run();
    new BuildIntDocVectors(conf).run();

    new BuildIntDocVectorsForwardIndex(conf).run();
    new BuildTermDocVectorsForwardIndex(conf).run();

    return 0;
}

From source file:ivory.core.driver.PreprocessWikipedia.java

License:Apache License

/**
 * Runs this tool.
 */
public int run(String[] args) throws Exception {
    int mode = args.length;
    if (mode != MONO_LINGUAL && mode != CROSS_LINGUAL_E && mode != CROSS_LINGUAL_F) {
        printUsage();
        return -1;
    }

    String indexRootPath = args[0];
    String rawCollection = args[1]; //"/shared/Wikipedia/raw/dewiki-20100117-pages-articles.xml";
    String seqCollection = args[2]; //"/umd-lin/fture/pwsim/de-wikipedia/compressed.block/de-20100117";
    String tokenizerClass = args[3];

    Configuration conf = new Configuration();

    String collectionLang = null, tokenizerModel = null, collectionVocab = null;
    String fVocab_f2e = null, eVocab_f2e = null, fVocab_e2f = null, eVocab_e2f = null, ttable_f2e = null,
            ttable_e2f = null;
    if (mode == CROSS_LINGUAL_E || mode == CROSS_LINGUAL_F) { // CROSS-LINGUAL CASE
        collectionLang = args[4];
        tokenizerModel = args[5];
        collectionVocab = args[6];
        conf.set("Ivory.Lang", collectionLang);
        conf.set("Ivory.TokenizerModel", tokenizerModel);
        conf.set("Ivory.CollectionVocab", collectionVocab);
        conf.set("Ivory.FinalVocab", collectionVocab);

        if (mode == CROSS_LINGUAL_F) { // non-English side, needs to be translated
            fVocab_f2e = args[6]; //  same as collection vocab
            eVocab_f2e = args[7];
            ttable_f2e = args[8];
            eVocab_e2f = args[9];
            fVocab_e2f = args[10];
            ttable_e2f = args[11];

            conf.set("Ivory.F_Vocab_F2E", fVocab_f2e);
            conf.set("Ivory.E_Vocab_F2E", eVocab_f2e);
            conf.set("Ivory.TTable_F2E", ttable_f2e);
            conf.set("Ivory.E_Vocab_E2F", eVocab_e2f);
            conf.set("Ivory.F_Vocab_E2F", fVocab_e2f);
            conf.set("Ivory.TTable_E2F", ttable_e2f);
            conf.set("Ivory.FinalVocab", eVocab_e2f);
        }
    }

    int numMappers = 100;
    int numReducers = 100;

    LOG.info("Tool name: WikipediaDriver");
    LOG.info(" - Index path: " + indexRootPath);
    LOG.info(" - Raw collection path: " + rawCollection);
    LOG.info(" - Compressed collection path: " + seqCollection);
    LOG.info(" - Tokenizer class: " + tokenizerClass);
    LOG.info(" - Minimum # terms per article : " + MinNumTermsPerArticle);

    if (mode == CROSS_LINGUAL_E || mode == CROSS_LINGUAL_F) {
        LOG.info("Cross-lingual collection : Preprocessing " + collectionLang + " side.");
        LOG.info(" - Collection vocab file: " + collectionVocab);
        LOG.info(" - Tokenizer model: " + tokenizerModel);

        if (mode == CROSS_LINGUAL_F) {
            LOG.info(" - TTable file " + collectionLang + " --> English : " + ttable_f2e);
            LOG.info(" - Source vocab file: " + fVocab_f2e);
            LOG.info(" - Target vocab file: " + eVocab_f2e);
            LOG.info(" - TTable file " + "English --> " + collectionLang + " : " + ttable_e2f);
            LOG.info(" - Source vocab file: " + fVocab_f2e);
            LOG.info(" - Target vocab file: " + eVocab_f2e);
        }
    }
    LOG.info("Launching with " + numMappers + " mappers, " + numReducers + " reducers...");

    FileSystem fs = FileSystem.get(conf);

    Path p = new Path(indexRootPath);
    if (!fs.exists(p)) {
        LOG.info("Index path doesn't exist, creating...");
        fs.mkdirs(p);
    }
    RetrievalEnvironment env = new RetrievalEnvironment(indexRootPath, fs);

    // Build docno mapping from raw collection
    Path mappingFile = env.getDocnoMappingData();
    if (!fs.exists(mappingFile)) {
        LOG.info(mappingFile + " doesn't exist, creating...");
        String[] arr = new String[] { "-input=" + rawCollection,
                "-output_path=" + indexRootPath + "/wiki-docid-tmp", "-output_file=" + mappingFile.toString(),
                "-keep_all=false" };

        BuildWikipediaDocnoMapping tool = new BuildWikipediaDocnoMapping();
        tool.setConf(conf);
        tool.run(arr);

        fs.delete(new Path(indexRootPath + "/wiki-docid-tmp"), true);
    } else {
        LOG.info(p + " exists");
    }

    // Repack Wikipedia into sequential compressed block
    p = new Path(seqCollection);
    if (!fs.exists(p)) {
        LOG.info(seqCollection + " doesn't exist, creating...");
        String[] arr = new String[] { "-input=" + rawCollection, "-output=" + seqCollection,
                "-mapping_file=" + mappingFile.toString(), "-compression_type=block",
                "-wiki_language=" + collectionLang };
        RepackWikipedia tool = new RepackWikipedia();
        tool.setConf(conf);
        tool.run(arr);
    }

    conf.set(Constants.CollectionName, "Wikipedia-" + collectionLang);
    conf.setInt(Constants.NumMapTasks, numMappers);
    conf.setInt(Constants.NumReduceTasks, numReducers);
    conf.set(Constants.CollectionPath, seqCollection);
    conf.set(Constants.IndexPath, indexRootPath);
    conf.set(Constants.InputFormat, SequenceFileInputFormat.class.getCanonicalName());
    conf.set(Constants.DocnoMappingClass, WikipediaDocnoMapping.class.getCanonicalName());
    conf.set(Constants.Tokenizer, tokenizerClass); //"ivory.tokenize.OpenNLPTokenizer"
    conf.setInt(Constants.MinDf, MinDF);
    conf.setInt(Constants.MaxDf, Integer.MAX_VALUE);
    conf.setInt(Constants.DocnoOffset, 0); // docnos start at 1
    conf.setInt(Constants.TermIndexWindow, TermIndexWindow);

    // Builds term doc vectors from the document collection, and filters out terms that are not included in Ivory.SrcVocab
    long startTime = System.currentTimeMillis();
    long preprocessStartTime = System.currentTimeMillis();
    LOG.info("Building term doc vectors...");
    new BuildTermDocVectors(conf).run();
    LOG.info("Job finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");

    // Get CF and DF counts
    startTime = System.currentTimeMillis();
    LOG.info("Counting terms...");
    new ComputeGlobalTermStatistics(conf).run();
    LOG.info("TermCount = " + env.readCollectionTermCount() + "\nJob finished in "
            + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");

    // Build a map from terms to sequentially generated integer term ids
    startTime = System.currentTimeMillis();
    LOG.info("Building term-to-integer id mapping...");
    new BuildDictionary(conf).run();
    LOG.info("Job finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");

    // Compute term weights, and output weighted term doc vectors
    startTime = System.currentTimeMillis();
    LOG.info("Building weighted term doc vectors...");
    conf.set("Ivory.ScoringModel", "ivory.pwsim.score.Bm25");
    if (mode == CROSS_LINGUAL_F) {
        conf.setInt("Ivory.MinNumTerms", MinNumTermsPerArticle);

        // translate term doc vectors into English. 
        conf.setBoolean("Ivory.Normalize", true);
        new BuildTranslatedTermDocVectors(conf).run();
    } else {
        conf.setInt("Ivory.MinNumTerms", MinNumTermsPerArticle);

        // get weighted term doc vectors
        conf.setBoolean("Ivory.Normalize", true);
        new BuildWeightedTermDocVectors(conf).run();
    }
    LOG.info("Job finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");

    // normalize (optional) and convert weighted term doc vectors into int doc vectors for efficiency
    startTime = System.currentTimeMillis();
    LOG.info("Building weighted integer doc vectors...");
    conf.setBoolean("Ivory.Normalize", IsNormalized);
    if (mode == MONO_LINGUAL) {
        new BuildIntDocVectors(conf).run();
        new BuildWeightedIntDocVectors(conf).run();
        LOG.info("Job BuildWeightedIntDocVectors finished in "
                + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");
    } else {
        BuildTargetLangWeightedIntDocVectors weightedIntVectorsTool = new BuildTargetLangWeightedIntDocVectors(
                conf);

        int finalNumDocs = weightedIntVectorsTool.run();
        LOG.info("Job BuildTargetLangWeightedIntDocVectors finished in "
                + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");
        if (finalNumDocs > 0) {
            LOG.info("Changed doc count from " + env.readCollectionDocumentCount() + " to = " + finalNumDocs);
            env.writeCollectionDocumentCount(finalNumDocs);
        }
        // Set Property.CollectionTermCount to the size of the target vocab, since all docs are
        // translated into that vocab. This property is read by WriteRandomVectors via
        // RunComputeSignatures.
        Vocab engVocabH = null;
        try {
            engVocabH = HadoopAlign.loadVocab(new Path(conf.get("Ivory.FinalVocab")), conf);
        } catch (IOException e) {
            e.printStackTrace();
        }
        LOG.info("Changed term count to : " + env.readCollectionTermCount() + " = " + engVocabH.size());
        env.writeCollectionTermCount(engVocabH.size());
    }

    LOG.info("Preprocessing job finished in " + (System.currentTimeMillis() - preprocessStartTime) / 1000.0
            + " seconds");

    return 0;
}

From source file:ivory.core.driver.PreprocessWt10g.java

License:Apache License

/**
 * Runs this tool.
 */
public int run(String[] args) throws Exception {
    if (args.length != 2) {
        printUsage();
        return -1;
    }

    String collection = args[0];
    String indexRootPath = args[1];

    LOG.info("Tool name: " + PreprocessWt10g.class.getCanonicalName());
    LOG.info(" - Collection path: " + collection);
    LOG.info(" - Index path: " + indexRootPath);

    Configuration conf = getConf();
    FileSystem fs = FileSystem.get(conf);

    // Create the index directory if it doesn't already exist.
    Path p = new Path(indexRootPath);
    if (!fs.exists(p)) {
        LOG.info("index directory doesn't exist, creating...");
        fs.mkdirs(p);
    }

    RetrievalEnvironment env = new RetrievalEnvironment(indexRootPath, fs);

    // Look for the docno mapping, which maps from docid (String) to docno (sequentially-numbered
    // integer). If it doesn't exist, create it.
    Path mappingFile = env.getDocnoMappingData();
    Path mappingDir = env.getDocnoMappingDirectory();

    if (!fs.exists(mappingFile)) {
        LOG.info("docno-mapping.dat doesn't exist, creating...");
        String[] arr = new String[] { collection, mappingDir.toString(), mappingFile.toString(), "100" };
        NumberTrecWebDocuments tool = new NumberTrecWebDocuments();
        tool.setConf(conf);
        tool.run(arr);

        fs.delete(mappingDir, true);
    }

    conf.set(Constants.CollectionName, "Wt10g");
    conf.set(Constants.CollectionPath, collection);
    conf.set(Constants.IndexPath, indexRootPath);
    conf.set(Constants.InputFormat, SequenceFileInputFormat.class.getCanonicalName());
    conf.set(Constants.Tokenizer, GalagoTokenizer.class.getCanonicalName());
    conf.set(Constants.DocnoMappingClass, Wt10gDocnoMapping.class.getCanonicalName());
    conf.set(Constants.DocnoMappingFile, mappingFile.toString());

    conf.setInt(Constants.DocnoOffset, 0); // docnos start at 1
    conf.setInt(Constants.MinDf, 10);
    conf.setInt(Constants.MaxDf, Integer.MAX_VALUE);
    conf.setInt(Constants.TermIndexWindow, 8);

    new BuildTermDocVectors(conf).run();
    new ComputeGlobalTermStatistics(conf).run();
    new BuildDictionary(conf).run();
    new BuildIntDocVectors(conf).run();

    new BuildIntDocVectorsForwardIndex(conf).run();
    new BuildTermDocVectorsForwardIndex(conf).run();

    return 0;
}

From source file:ivory.core.index.BuildIPInvertedIndexDocSorted.java

License:Apache License

public int runTool() throws Exception {
    Configuration conf = getConf();
    FileSystem fs = FileSystem.get(conf);

    String indexPath = conf.get(Constants.IndexPath);
    RetrievalEnvironment env = new RetrievalEnvironment(indexPath, fs);

    String collectionName = env.readCollectionName();

    int reduceTasks = conf.getInt(Constants.NumReduceTasks, 0);
    int minSplitSize = conf.getInt(Constants.MinSplitSize, 0);
    int collectionDocCnt = env.readCollectionDocumentCount();

    String postingsType = conf.get(Constants.PostingsListsType,
            ivory.core.data.index.PostingsListDocSortedPositional.class.getCanonicalName());
    @SuppressWarnings("unchecked")
    Class<? extends PostingsList> postingsClass = (Class<? extends PostingsList>) Class.forName(postingsType);

    LOG.info("PowerTool: " + BuildIPInvertedIndexDocSorted.class.getCanonicalName());
    LOG.info(String.format(" - %s: %s", Constants.IndexPath, indexPath));
    LOG.info(String.format(" - %s: %s", Constants.CollectionName, collectionName));
    LOG.info(String.format(" - %s: %s", Constants.CollectionDocumentCount, collectionDocCnt));
    LOG.info(String.format(" - %s: %s", Constants.PostingsListsType, postingsClass.getCanonicalName()));
    LOG.info(String.format(" - %s: %s", Constants.NumReduceTasks, reduceTasks));
    LOG.info(String.format(" - %s: %s", Constants.MinSplitSize, minSplitSize));

    if (!fs.exists(new Path(indexPath))) {
        fs.mkdirs(new Path(indexPath));
    }

    Path inputPath = new Path(env.getIntDocVectorsDirectory());
    Path postingsPath = new Path(env.getPostingsDirectory());

    if (fs.exists(postingsPath)) {
        LOG.info("Postings already exist: no indexing will be performed.");
        return 0;
    }

    conf.setInt(Constants.CollectionDocumentCount, collectionDocCnt);

    conf.setInt("mapred.min.split.size", minSplitSize);
    conf.set("mapred.child.java.opts", "-Xmx2048m");

    Job job = new Job(conf, BuildIPInvertedIndexDocSorted.class.getSimpleName() + ":" + collectionName);
    job.setJarByClass(BuildIPInvertedIndexDocSorted.class);

    job.setNumReduceTasks(reduceTasks);

    FileInputFormat.setInputPaths(job, inputPath);
    FileOutputFormat.setOutputPath(job, postingsPath);

    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);

    job.setMapOutputKeyClass(PairOfInts.class);
    job.setMapOutputValueClass(TermPositions.class);
    job.setOutputKeyClass(IntWritable.class);
    job.setOutputValueClass(postingsClass);

    job.setMapperClass(MyMapper.class);
    job.setReducerClass(MyReducer.class);
    job.setPartitionerClass(MyPartitioner.class);

    long startTime = System.currentTimeMillis();
    job.waitForCompletion(true);
    LOG.info("Job Finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");

    env.writePostingsType(postingsClass.getCanonicalName());

    return 0;
}

From source file:ivory.core.index.BuildLPInvertedIndexDocSorted.java

License:Apache License

public int runTool() throws Exception {
    Configuration conf = getConf();
    FileSystem fs = FileSystem.get(conf);

    String indexPath = conf.get(Constants.IndexPath);
    RetrievalEnvironment env = new RetrievalEnvironment(indexPath, fs);

    String collectionName = env.readCollectionName();

    int reduceTasks = conf.getInt(Constants.NumReduceTasks, 0);
    int minSplitSize = conf.getInt(Constants.MinSplitSize, 0);
    int collectionDocCount = env.readCollectionDocumentCount();

    String postingsType = conf.get(Constants.PostingsListsType,
            PostingsListDocSortedPositional.class.getCanonicalName());
    @SuppressWarnings("unchecked")
    Class<? extends PostingsList> postingsClass = (Class<? extends PostingsList>) Class.forName(postingsType);

    // These are the default values for the LP algorithm.
    float mapMemoryThreshold = conf.getFloat(Constants.IndexingMapMemoryThreshold, 0.9f);
    float reduceMemoryThreshold = conf.getFloat(Constants.IndexingReduceMemoryThreshold, 0.9f);
    int maxHeap = conf.getInt(Constants.MaxHeap, 2048);
    int maxNDocsBeforeFlush = conf.getInt(Constants.MaxNDocsBeforeFlush, 50000);

    LOG.info("PowerTool: " + BuildLPInvertedIndexDocSorted.class.getSimpleName());
    LOG.info(String.format(" - %s: %s", Constants.IndexPath, indexPath));
    LOG.info(String.format(" - %s: %s", Constants.CollectionName, collectionName));
    LOG.info(String.format(" - %s: %s", Constants.CollectionDocumentCount, collectionDocCount));
    LOG.info(String.format(" - %s: %s", Constants.PostingsListsType, postingsClass.getCanonicalName()));
    LOG.info(String.format(" - %s: %s", Constants.NumReduceTasks, reduceTasks));
    LOG.info(String.format(" - %s: %s", Constants.MinSplitSize, minSplitSize));
    LOG.info(String.format(" - %s: %s", Constants.IndexingMapMemoryThreshold, mapMemoryThreshold));
    LOG.info(String.format(" - %s: %s", Constants.IndexingReduceMemoryThreshold, reduceMemoryThreshold));
    LOG.info(String.format(" - %s: %s", Constants.MaxHeap, maxHeap));
    LOG.info(String.format(" - %s: %s", Constants.MaxNDocsBeforeFlush, maxNDocsBeforeFlush));

    if (!fs.exists(new Path(indexPath))) {
        fs.mkdirs(new Path(indexPath));
    }

    Path inputPath = new Path(env.getIntDocVectorsDirectory());
    Path postingsPath = new Path(env.getPostingsDirectory());

    if (fs.exists(postingsPath)) {
        LOG.info("Postings already exist: no indexing will be performed.");
        return 0;
    }

    conf.setInt(Constants.CollectionDocumentCount, collectionDocCount);

    conf.setInt("mapred.min.split.size", minSplitSize);
    //conf.set("mapred.child.java.opts", "-Xmx" + maxHeap + "m");
    conf.set("mapreduce.map.memory.mb", "2048");
    conf.set("mapreduce.map.java.opts", "-Xmx2048m");
    conf.set("mapreduce.reduce.memory.mb", "2048");
    conf.set("mapreduce.reduce.java.opts", "-Xmx2048m");

    Job job = Job.getInstance(conf, BuildLPInvertedIndexDocSorted.class.getSimpleName() + ":" + collectionName);
    job.setJarByClass(BuildLPInvertedIndexDocSorted.class);

    job.setNumReduceTasks(reduceTasks);

    FileInputFormat.setInputPaths(job, inputPath);
    FileOutputFormat.setOutputPath(job, postingsPath);

    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);

    job.setMapOutputKeyClass(IntWritable.class);
    job.setMapOutputValueClass(PostingsListDocSortedPositional.class);
    job.setOutputKeyClass(IntWritable.class);
    job.setOutputValueClass(PostingsListDocSortedPositional.class);

    job.setMapperClass(MyMapper.class);
    job.setReducerClass(MyReducer.class);

    long startTime = System.currentTimeMillis();
    job.waitForCompletion(true);
    LOG.info("Job Finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");

    env.writePostingsType("ivory.data.PostingsListDocSortedPositional");

    return 0;
}

From source file:ivory.driver.PreprocessGov2.java

License:Apache License

/**
 * Runs this tool.
 */
public int run(String[] args) throws Exception {
    if (args.length != 4) {
        printUsage();
        return -1;
    }

    String collection = args[0];
    String indexRootPath = args[1];
    int numMappers = Integer.parseInt(args[2]);
    int numReducers = Integer.parseInt(args[3]);

    sLogger.info("Tool name: PreprocessGov2");
    sLogger.info(" - Collection path: " + collection);
    sLogger.info(" - Index path: " + indexRootPath);

    Configuration conf = getConf();
    FileSystem fs = FileSystem.get(conf);

    // Create the index directory if it doesn't already exist.
    Path p = new Path(indexRootPath);
    if (!fs.exists(p)) {
        sLogger.info("index directory doesn't exist, creating...");
        fs.mkdirs(p);
    }

    RetrievalEnvironment env = new RetrievalEnvironment(indexRootPath, fs);

    // Look for the docno mapping, which maps from docid (String) to docno
    // (sequentially-numbered integer). If it doesn't exist, create it.
    Path mappingFile = env.getDocnoMappingData();
    Path mappingDir = env.getDocnoMappingDirectory();

    if (!fs.exists(mappingFile)) {
        sLogger.info("docno-mapping.dat doesn't exist, creating...");
        String[] arr = new String[] { collection, mappingDir.toString(), mappingFile.toString(),
                Integer.toString(numMappers) };
        NumberTrecWebDocuments tool = new NumberTrecWebDocuments();
        tool.setConf(conf);
        tool.run(arr);

        fs.delete(mappingDir, true);
    }

    // Now we're ready to start the preprocessing pipeline... set
    // appropriate properties.
    conf.setInt("Ivory.NumMapTasks", numMappers);
    conf.setInt("Ivory.NumReduceTasks", numReducers);

    conf.set("Ivory.CollectionName", "Gov2");
    conf.set("Ivory.CollectionPath", collection);
    conf.set("Ivory.IndexPath", indexRootPath);
    conf.set("Ivory.InputFormat", "org.apache.hadoop.mapred.SequenceFileInputFormat");
    conf.set("Ivory.Tokenizer", "ivory.tokenize.GalagoTokenizer");
    conf.set("Ivory.DocnoMappingClass", "edu.umd.cloud9.collection.trecweb.Gov2DocnoMapping");
    conf.set("Ivory.DocnoMappingFile", mappingFile.toString());

    conf.setInt("Ivory.DocnoOffset", 0); // docnos start at 1
    conf.setInt("Ivory.MinDf", 10);
    conf.setInt("Ivory.MaxDf", Integer.MAX_VALUE);
    conf.setInt("Ivory.TermIndexWindow", 8);

    new BuildTermDocVectors(conf).run();
    new GetTermCount(conf).run();
    new BuildTermIdMap(conf).run();
    new BuildIntDocVectors(conf).run();

    new BuildIntDocVectorsForwardIndex(conf).run();
    new BuildTermDocVectorsForwardIndex(conf).run();

    return 0;
}