List of usage examples for org.apache.hadoop.fs FileSystem mkdirs
public boolean mkdirs(Path f) throws IOException
From source file:ivory.app.PreprocessWikipedia.java
License:Apache License
/** * Runs this tool./*from www . j av a2 s. c o m*/ */ public int run(String[] args) throws Exception { if (parseArgs(args) < 0) { printUsage(); return -1; } Configuration conf = getConf(); conf.set(Constants.Language, collectionLang); conf.setBoolean(Constants.Stemming, true); // default behavior of tokenizer is currently to stem, but we shouldnt rely on that if (tokenizerModel != null) { conf.set(Constants.TokenizerData, tokenizerModel); } // user can either provide a tokenizer class as a program argument, // or let the factory find an appropriate class based on language code try { Class.forName(tokenizerClass); } catch (Exception e) { tokenizerClass = TokenizerFactory.getTokenizerClass(collectionLang, tokenizerModel).getCanonicalName(); } if (collectionVocab != null) { conf.set(Constants.CollectionVocab, collectionVocab); // vocabulary to read collection from } if (e_stopwordList != null) { conf.set(Constants.StopwordList, e_stopwordList); conf.set(Constants.StemmedStopwordList, e_stopwordList + ".stemmed"); } // CROSS-LINGUAL CASE if (mode == CROSS_LINGUAL_E) { // English side conf.set("Ivory.FinalVocab", collectionVocab); // vocabulary to map terms to integers in BuildTargetLang... conf.set(Constants.StopwordList, e_stopwordList); conf.set(Constants.StemmedStopwordList, e_stopwordList + ".stemmed"); } if (mode == CROSS_LINGUAL_F) { // non-English side, needs to be translated conf.set(Constants.TargetIndexPath, targetIndexPath); conf.set("Ivory.F_Vocab_F2E", fVocab_f2e); conf.set("Ivory.E_Vocab_F2E", eVocab_f2e); conf.set("Ivory.TTable_F2E", ttable_f2e); conf.set("Ivory.E_Vocab_E2F", eVocab_e2f); conf.set("Ivory.F_Vocab_E2F", fVocab_e2f); conf.set("Ivory.TTable_E2F", ttable_e2f); conf.set(Constants.CollectionVocab, fVocab_f2e); // vocabulary to read collection from conf.set("Ivory.FinalVocab", eVocab_f2e); // vocabulary to map terms to integers in BuildTargetLang... 
if (f_stopwordList != null) { conf.set(Constants.StopwordList, f_stopwordList); conf.set(Constants.StemmedStopwordList, f_stopwordList + ".stemmed"); } if (e_stopwordList != null) { conf.set(Constants.TargetStopwordList, e_stopwordList); conf.set(Constants.TargetStemmedStopwordList, e_stopwordList + ".stemmed"); } if (e_tokenizerModel != null) { conf.set(Constants.TargetTokenizer, e_tokenizerModel); } conf.set(Constants.TargetLanguage, targetLang); } int numMappers = 100; int numReducers = 100; // Print out options LOG.info("Tool name: WikipediaDriver"); LOG.info(" - Index path: " + indexRootPath); LOG.info(" - Raw collection path: " + rawCollection); LOG.info(" - Compressed collection path: " + seqCollection); LOG.info(" - Collection language: " + collectionLang); LOG.info(" - Tokenizer class: " + tokenizerClass); LOG.info(" - Tokenizer model: " + tokenizerModel); LOG.info(" - Minimum # terms per article : " + MinNumTermsPerArticle); LOG.info(" - Stopwords file: " + e_stopwordList); if (mode == CROSS_LINGUAL_E || mode == CROSS_LINGUAL_F) { LOG.info("Cross-lingual collection : Preprocessing " + collectionLang + " side."); LOG.info(" - Collection vocab file: " + conf.get(Constants.CollectionVocab)); LOG.info(" - Tokenizer model: " + tokenizerModel); if (mode == CROSS_LINGUAL_F) { LOG.info(" - TTable file " + collectionLang + " --> " + targetLang + " : " + ttable_f2e); LOG.info(" - Source vocab file: " + fVocab_f2e); LOG.info(" - Target vocab file: " + eVocab_f2e); LOG.info(" - TTable file " + targetLang + " --> " + collectionLang + " : " + ttable_e2f); LOG.info(" - Source vocab file: " + eVocab_e2f); LOG.info(" - Target vocab file: " + fVocab_e2f); LOG.info(" - Source stopwords file: " + f_stopwordList); LOG.info(" - Target stopwords file: " + e_stopwordList); LOG.info(" - Target stemmed stopwords file: " + conf.get(Constants.TargetStemmedStopwordList)); LOG.info(" - Target tokenizer path: " + e_tokenizerModel); } } FileSystem fs = FileSystem.get(conf); Path p = new 
Path(indexRootPath); if (!fs.exists(p)) { LOG.info("Index path doesn't exist, creating..."); fs.mkdirs(p); } RetrievalEnvironment env = new RetrievalEnvironment(indexRootPath, fs); // Build docno mapping from raw collection Path mappingFile = env.getDocnoMappingData(); if (!fs.exists(mappingFile)) { LOG.info(mappingFile + " doesn't exist, creating..."); String[] arr = new String[] { "-input=" + rawCollection, "-output_file=" + mappingFile.toString(), "-wiki_language=" + collectionLang }; LOG.info("Running WikipediaDocnoMappingBuilder with args " + Arrays.toString(arr)); WikipediaDocnoMappingBuilder tool = new WikipediaDocnoMappingBuilder(); tool.setConf(conf); tool.run(arr); fs.delete(new Path(indexRootPath + "/wiki-docid-tmp"), true); } else { LOG.info("Docno mapping already exists at: " + mappingFile); } // Repack Wikipedia into sequential compressed block if (!fs.exists(new Path(seqCollection + "/part-00000"))) { LOG.info(seqCollection + " doesn't exist, creating..."); String[] arr = new String[] { "-input=" + rawCollection, "-output=" + seqCollection, "-mapping_file=" + mappingFile.toString(), "-compression_type=block", "-wiki_language=" + collectionLang }; LOG.info("Running RepackWikipedia with args " + Arrays.toString(arr)); RepackWikipedia tool = new RepackWikipedia(); tool.setConf(conf); tool.run(arr); } else { LOG.info("Repacked collection already exists at: " + seqCollection); } conf.set(Constants.CollectionName, "Wikipedia-" + collectionLang); conf.setInt(Constants.NumMapTasks, numMappers); conf.setInt(Constants.NumReduceTasks, numReducers); conf.set(Constants.CollectionPath, seqCollection); conf.set(Constants.IndexPath, indexRootPath); conf.set(Constants.InputFormat, SequenceFileInputFormat.class.getCanonicalName()); conf.set(Constants.DocnoMappingClass, WikipediaDocnoMapping.class.getCanonicalName()); conf.set(Constants.Tokenizer, tokenizerClass); //"ivory.tokenize.OpenNLPTokenizer" conf.setInt(Constants.MinDf, MinDF); conf.setInt(Constants.MaxDf, 
Integer.MAX_VALUE); conf.setInt(Constants.DocnoOffset, 0); // docnos start at 1 conf.setInt(Constants.TermIndexWindow, TermIndexWindow); // Builds term doc vectors from document collection, and filters the terms that are not included // in Ivory.SrcVocab. long startTime = System.currentTimeMillis(); long preprocessStartTime = System.currentTimeMillis(); LOG.info("Building term doc vectors..."); int exitCode = new BuildTermDocVectors(conf).run(); if (exitCode >= 0) { LOG.info("Job BuildTermDocVectors finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds"); } else { LOG.info("Error: BuildTermDocVectors. Terminating..."); return -1; } // Get CF and DF counts. startTime = System.currentTimeMillis(); LOG.info("Counting terms..."); exitCode = new ComputeGlobalTermStatistics(conf).run(); LOG.info("TermCount = " + env.readCollectionTermCount()); if (exitCode >= 0) { LOG.info("Job ComputeGlobalTermStatistics finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds"); } else { LOG.info("Error: ComputeGlobalTermStatistics. Terminating..."); return -1; } // Build a map from terms to sequentially generated integer term ids. startTime = System.currentTimeMillis(); LOG.info("Building term-to-integer id mapping..."); exitCode = new BuildDictionary(conf).run(); if (exitCode >= 0) { LOG.info("Job BuildDictionary finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds"); } else { LOG.info("Error: BuildDictionary. Terminating..."); return -1; } // Compute term weights, and output weighted term doc vectors. LOG.info("Building weighted term doc vectors..."); startTime = System.currentTimeMillis(); conf.set("Ivory.ScoringModel", "ivory.pwsim.score.Bm25"); conf.setBoolean("Ivory.Normalize", IsNormalized); conf.setInt("Ivory.MinNumTerms", MinNumTermsPerArticle); if (mode == CROSS_LINGUAL_F) { // Translate term doc vectors into English. 
exitCode = new BuildTranslatedTermDocVectors(conf).run(); } else { // Build weighted term doc vectors. exitCode = new BuildWeightedTermDocVectors(conf).run(); } if (exitCode >= 0) { LOG.info("Job BuildTranslated/WeightedTermDocVectors finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds"); } else { LOG.info("Error: BuildTranslated/WeightedTermDocVectors. Terminating..."); return -1; } // normalize (optional) and convert weighted term doc vectors into int doc vectors for efficiency startTime = System.currentTimeMillis(); LOG.info("Building weighted integer doc vectors..."); conf.setBoolean("Ivory.Normalize", IsNormalized); if (mode == MONO_LINGUAL) { exitCode = new BuildIntDocVectors(conf).run(); exitCode = new BuildWeightedIntDocVectors(conf).run(); if (exitCode >= 0) { LOG.info("Job BuildWeightedIntDocVectors finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds"); } else { LOG.info("Error: BuildWeightedIntDocVectors. Terminating..."); return -1; } } else { BuildTargetLangWeightedIntDocVectors weightedIntVectorsTool = new BuildTargetLangWeightedIntDocVectors( conf); int finalNumDocs = weightedIntVectorsTool.run(); LOG.info("Job BuildTargetLangWeightedIntDocVectors finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds"); if (finalNumDocs > 0) { LOG.info("Changed doc count: " + env.readCollectionDocumentCount() + " => " + finalNumDocs); env.writeCollectionDocumentCount(finalNumDocs); } else { LOG.info("No document output! Terminating..."); return -1; } // set Property.CollectionTermCount to the size of the target vocab. since all docs are translated into that vocab. This property is read by WriteRandomVectors via RunComputeSignatures. 
Vocab engVocabH = null; try { engVocabH = HadoopAlign.loadVocab(new Path(conf.get("Ivory.FinalVocab")), conf); } catch (IOException e) { e.printStackTrace(); } LOG.info("Changed term count: " + env.readCollectionTermCount() + " => " + engVocabH.size()); env.writeCollectionTermCount(engVocabH.size()); } LOG.info("Preprocessing job finished in " + (System.currentTimeMillis() - preprocessStartTime) / 1000.0 + " seconds"); return 0; }
From source file:ivory.core.driver.PreprocessAquaint2.java
License:Apache License
/** * Runs this tool.//from w w w .ja v a2 s . c om */ public int run(String[] args) throws Exception { if (args.length != 2) { printUsage(); return -1; } String collection = args[0]; String indexRootPath = args[1]; LOG.info("Tool name: " + PreprocessAquaint2.class.getCanonicalName()); LOG.info(" - Collection path: " + collection); LOG.info(" - Index path: " + indexRootPath); Configuration conf = getConf(); FileSystem fs = FileSystem.get(conf); // Create the index directory if it doesn't already exist. Path p = new Path(indexRootPath); if (!fs.exists(p)) { fs.mkdirs(p); } else { LOG.info("Index directory already exists, skipping!"); } RetrievalEnvironment env = new RetrievalEnvironment(indexRootPath, fs); conf.set(XMLInputFormat.START_TAG_KEY, Aquaint2Document.getXmlStartTag(fs, collection)); conf.set(XMLInputFormat.END_TAG_KEY, Aquaint2Document.getXmlEndTag()); // Look for the docno mapping, which maps from docid (String) to docno // (sequentially-number integer). If it doesn't exist create it. 
Path mappingFile = env.getDocnoMappingData(); Path mappingDir = env.getDocnoMappingDirectory(); if (!fs.exists(mappingFile)) { String[] arr = new String[] { collection, mappingDir.toString(), mappingFile.toString() }; NumberAquaint2Documents2 tool = new NumberAquaint2Documents2(); tool.setConf(conf); tool.run(arr); fs.delete(mappingDir, true); } else { LOG.info("DocnoMapping already exists, skipping!"); } Aquaint2DocnoMapping dm = new Aquaint2DocnoMapping(); dm.loadMapping(mappingFile, fs); int docno; int expectedDocno; String expectedDocid; String docid; boolean testAquaint2 = false; if (testAquaint2) { docno = 500; expectedDocid = "AFP_ENG_20041001.0500"; docid = dm.getDocid(docno); System.out.println("dm.getDocid(" + docno + "): " + docid + ", should be: " + expectedDocid + ", " + (expectedDocid.equals(docid))); docno = 600; expectedDocid = "AFP_ENG_20041001.0600"; docid = dm.getDocid(docno); System.out.println("dm.getDocid(" + docno + "): " + docid + ", should be: " + expectedDocid + ", " + (expectedDocid.equals(docid))); docno = 700; expectedDocid = "AFP_ENG_20041001.0701"; docid = dm.getDocid(docno); System.out.println("dm.getDocid(" + docno + "): " + docid + ", should be: " + expectedDocid + ", " + (expectedDocid.equals(docid))); docno = 800; expectedDocid = "AFP_ENG_20041003.0019"; docid = dm.getDocid(docno); System.out.println("dm.getDocid(" + docno + "): " + docid + ", should be: " + expectedDocid + ", " + (expectedDocid.equals(docid))); expectedDocno = 500; docid = "AFP_ENG_20041001.0500"; docno = dm.getDocno(docid); System.out.println("dm.getDocno(" + docid + "): " + docno + ", should be: " + expectedDocno + ", " + (expectedDocno == docno)); expectedDocno = 600; docid = "AFP_ENG_20041001.0600"; docno = dm.getDocno(docid); System.out.println("dm.getDocno(" + docid + "): " + docno + ", should be: " + expectedDocno + ", " + (expectedDocno == docno)); expectedDocno = 700; docid = "AFP_ENG_20041001.0701"; docno = dm.getDocno(docid); 
System.out.println("dm.getDocno(" + docid + "): " + docno + ", should be: " + expectedDocno + ", " + (expectedDocno == docno)); expectedDocno = 800; docid = "AFP_ENG_20041003.0019"; docno = dm.getDocno(docid); System.out.println("dm.getDocno(" + docid + "): " + docno + ", should be: " + expectedDocno + ", " + (expectedDocno == docno)); System.out.println("finished testing, now exiting"); return 0; } boolean testGigaword = false; if (testGigaword) { for (int i = 1; i < 301; i++) { docno = i * 1000; docid = dm.getDocid(docno); System.out.println("dm.getDocid(" + docno + "): " + docid); } System.out.println("finished testing, now exiting"); return 0; } conf.set(Constants.CollectionName, "Aquaint2"); conf.set(Constants.CollectionPath, collection); conf.set(Constants.IndexPath, indexRootPath); conf.set(Constants.InputFormat, Aquaint2DocumentInputFormat2.class.getCanonicalName()); conf.set(Constants.Tokenizer, GalagoTokenizer.class.getCanonicalName()); conf.set(Constants.DocnoMappingClass, Aquaint2DocnoMapping.class.getCanonicalName()); conf.set(Constants.DocnoMappingFile, env.getDocnoMappingData().toString()); conf.setInt(Constants.DocnoOffset, 0); // docnos start at 1 conf.setInt(Constants.MinDf, 2); // toss away singleton terms conf.setInt(Constants.MaxDf, Integer.MAX_VALUE); conf.setInt(Constants.TermIndexWindow, 8); new BuildTermDocVectors(conf).run(); new ComputeGlobalTermStatistics(conf).run(); new BuildDictionary(conf).run(); new BuildIntDocVectors(conf).run(); new BuildIntDocVectorsForwardIndex(conf).run(); //new BuildTermDocVectorsForwardIndex(conf).run(); new BuildIPInvertedIndexDocSorted(conf).run(); conf.set(Constants.ScoringModel, "ivory.pwsim.score.TfIdf"); conf.setBoolean(Constants.Normalize, true); new BuildIntPostingsForwardIndex(conf).run(); boolean buildingVectors = true; //boolean buildingVectors = false; if (buildingVectors) { //new BuildWeightedIntDocVectors(conf).run(); //conf.setBoolean(Constants.BuildWeighted, true); //new 
BuildIntDocVectorsForwardIndex(conf).run(); String findexDirPath = indexRootPath + "/findex"; String findexFilePath = indexRootPath + "/findex.dat"; if (fs.exists(new Path(findexDirPath))) { LOG.info("ForwardIndex already exists: Skipping!"); } else { new BuildAquaint2ForwardIndex().runTool(conf, collection, findexDirPath, findexFilePath, mappingFile.toString()); } } return 0; }
From source file:ivory.core.driver.PreprocessGov2.java
License:Apache License
/** * Runs this tool./*from w w w .j a va2 s . c o m*/ */ public int run(String[] args) throws Exception { if (args.length != 2) { printUsage(); return -1; } String collection = args[0]; String indexRootPath = args[1]; LOG.info("Tool name: " + PreprocessGov2.class.getCanonicalName()); LOG.info(" - Collection path: " + collection); LOG.info(" - Index path: " + indexRootPath); Configuration conf = getConf(); FileSystem fs = FileSystem.get(conf); // Create the index directory if it doesn't already exist. Path p = new Path(indexRootPath); if (!fs.exists(p)) { LOG.info("index directory doesn't exist, creating..."); fs.mkdirs(p); } RetrievalEnvironment env = new RetrievalEnvironment(indexRootPath, fs); // Look for the docno mapping, which maps from docid (String) to docno // (sequentially-number integer). If it doesn't exist create it. Path mappingFile = env.getDocnoMappingData(); Path mappingDir = env.getDocnoMappingDirectory(); if (!fs.exists(mappingFile)) { LOG.info("docno-mapping.dat doesn't exist, creating..."); String[] arr = new String[] { collection, mappingDir.toString(), mappingFile.toString(), "100" }; NumberTrecWebDocuments tool = new NumberTrecWebDocuments(); tool.setConf(conf); tool.run(arr); fs.delete(mappingDir, true); } conf.set(Constants.CollectionName, "Gov2"); conf.set(Constants.CollectionPath, collection); conf.set(Constants.IndexPath, indexRootPath); conf.set(Constants.InputFormat, SequenceFileInputFormat.class.getCanonicalName()); conf.set(Constants.Tokenizer, GalagoTokenizer.class.getCanonicalName()); conf.set(Constants.DocnoMappingClass, Gov2DocnoMapping.class.getCanonicalName()); conf.set(Constants.DocnoMappingFile, mappingFile.toString()); conf.setInt(Constants.DocnoOffset, 0); // docnos start at 1 conf.setInt(Constants.MinDf, 10); conf.setInt(Constants.MaxDf, Integer.MAX_VALUE); conf.setInt(Constants.TermIndexWindow, 8); new BuildTermDocVectors(conf).run(); new ComputeGlobalTermStatistics(conf).run(); new BuildDictionary(conf).run(); new 
BuildIntDocVectors(conf).run(); new BuildIntDocVectorsForwardIndex(conf).run(); new BuildTermDocVectorsForwardIndex(conf).run(); return 0; }
From source file:ivory.core.driver.PreprocessMedline.java
License:Apache License
/** * Runs this tool./*from w ww. jav a 2 s . com*/ */ public int run(String[] args) throws Exception { if (args.length != 2) { printUsage(); return -1; } String collection = args[0]; String indexPath = args[1]; LOG.info("Tool name: ProcessMedline"); LOG.info(" - Collection path: " + collection); LOG.info(" - Index path: " + indexPath); Configuration conf = getConf(); FileSystem fs = FileSystem.get(conf); // Create the index directory if it doesn't already exist. Path p = new Path(indexPath); if (!fs.exists(p)) { LOG.info("index path doesn't exist, creating..."); fs.mkdirs(p); } RetrievalEnvironment env = new RetrievalEnvironment(indexPath, fs); // Look for the docno mapping, which maps from docid (String) to docno (sequentially-number // integer). If it doesn't exist create it. Path mappingFile = env.getDocnoMappingData(); Path mappingDir = env.getDocnoMappingDirectory(); if (!fs.exists(mappingFile)) { LOG.info(mappingFile + " doesn't exist, creating..."); String[] arr = new String[] { collection, mappingDir.toString(), mappingFile.toString() }; NumberMedlineCitations2 tool = new NumberMedlineCitations2(); tool.setConf(conf); tool.run(arr); fs.delete(mappingDir, true); } conf.set(Constants.CollectionName, "Medline"); conf.set(Constants.CollectionPath, collection); conf.set(Constants.IndexPath, indexPath); conf.set(Constants.InputFormat, MedlineCitationInputFormat2.class.getCanonicalName()); conf.set(Constants.Tokenizer, GalagoTokenizer.class.getCanonicalName()); conf.set(Constants.DocnoMappingClass, MedlineDocnoMapping.class.getCanonicalName()); conf.set(Constants.DocnoMappingFile, env.getDocnoMappingData().toString()); conf.setInt(Constants.DocnoOffset, 0); // docnos start at 1 conf.setInt(Constants.MinDf, 2); // toss away singleton terms conf.setInt(Constants.MaxDf, Integer.MAX_VALUE); new BuildTermDocVectors(conf).run(); new ComputeGlobalTermStatistics(conf).run(); new BuildDictionary(conf).run(); new BuildIntDocVectors(conf).run(); new 
BuildIntDocVectorsForwardIndex(conf).run(); new BuildTermDocVectorsForwardIndex(conf).run(); return 0; }
From source file:ivory.core.driver.PreprocessTREC.java
License:Apache License
/**
 * Runs this tool: creates the index directory and the docno mapping when they are missing,
 * then runs the indexing stages for the TREC collection.
 *
 * @param args exactly two entries: the collection path and the index root path
 * @return -1 when the argument count is wrong, 0 otherwise
 * @throws Exception if any file system operation or indexing stage throws
 */
public int run(String[] args) throws Exception {
  final boolean wrongArgCount = args.length != 2;
  if (wrongArgCount) {
    printUsage();
    return -1;
  }

  final String collection = args[0];
  final String indexRootPath = args[1];

  LOG.info("Tool name: " + PreprocessTREC.class.getCanonicalName());
  LOG.info(" - Collection path: " + collection);
  LOG.info(" - Index path: " + indexRootPath);

  final Configuration conf = getConf();
  final FileSystem fs = FileSystem.get(conf);

  // The index root must exist before the retrieval environment is used.
  final Path indexDir = new Path(indexRootPath);
  final boolean indexDirMissing = !fs.exists(indexDir);
  if (indexDirMissing) {
    LOG.info("index directory doesn't exist, creating...");
    fs.mkdirs(indexDir);
  }

  final RetrievalEnvironment env = new RetrievalEnvironment(indexRootPath, fs);

  // The docno mapping translates a docid (String) into a sequentially numbered docno (int);
  // it is generated here when the mapping file is absent.
  final Path mappingFile = env.getDocnoMappingData();
  final Path mappingDir = env.getDocnoMappingDirectory();
  final boolean mappingMissing = !fs.exists(mappingFile);
  if (mappingMissing) {
    LOG.info("docno-mapping.dat doesn't exist, creating...");
    String[] numberingArgs = { collection, mappingDir.toString(), mappingFile.toString() };
    NumberTrecDocuments2 numberer = new NumberTrecDocuments2();
    numberer.setConf(conf);
    numberer.run(numberingArgs);

    fs.delete(mappingDir, true);
  }

  // Collection description consumed by the indexing stages below.
  conf.set(Constants.CollectionName, "TREC_vol45");
  conf.set(Constants.CollectionPath, collection);
  conf.set(Constants.IndexPath, indexRootPath);
  conf.set(Constants.InputFormat, TrecDocumentInputFormat2.class.getCanonicalName());
  conf.set(Constants.Tokenizer, GalagoTokenizer.class.getCanonicalName());
  conf.set(Constants.DocnoMappingClass, TrecDocnoMapping.class.getCanonicalName());
  conf.set(Constants.DocnoMappingFile, env.getDocnoMappingData().toString());

  conf.setInt(Constants.DocnoOffset, 0); // per the original note, docnos start at 1
  conf.setInt(Constants.MinDf, 2); // drops singleton terms
  conf.setInt(Constants.MaxDf, Integer.MAX_VALUE);
  conf.setInt(Constants.TermIndexWindow, 8);

  // Indexing pipeline, in dependency order.
  new BuildTermDocVectors(conf).run();
  new ComputeGlobalTermStatistics(conf).run();
  new BuildDictionary(conf).run();
  new BuildIntDocVectors(conf).run();

  // Forward indexes over both vector representations.
  new BuildIntDocVectorsForwardIndex(conf).run();
  new BuildTermDocVectorsForwardIndex(conf).run();

  return 0;
}
From source file:ivory.core.driver.PreprocessWikipedia.java
License:Apache License
/** * Runs this tool./*from www . j ava 2s. c o m*/ */ public int run(String[] args) throws Exception { int mode = args.length; if (mode != MONO_LINGUAL && mode != CROSS_LINGUAL_E && mode != CROSS_LINGUAL_F) { printUsage(); return -1; } String indexRootPath = args[0]; String rawCollection = args[1]; //"/shared/Wikipedia/raw/dewiki-20100117-pages-articles.xml"; String seqCollection = args[2]; //"/umd-lin/fture/pwsim/de-wikipedia/compressed.block/de-20100117"; String tokenizerClass = args[3]; Configuration conf = new Configuration(); String collectionLang = null, tokenizerModel = null, collectionVocab = null; String fVocab_f2e = null, eVocab_f2e = null, fVocab_e2f, eVocab_e2f = null, ttable_f2e = null, ttable_e2f = null; if (mode == CROSS_LINGUAL_E || mode == CROSS_LINGUAL_F) { // CROSS-LINGUAL CASE collectionLang = args[4]; tokenizerModel = args[5]; collectionVocab = args[6]; conf.set("Ivory.Lang", collectionLang); conf.set("Ivory.TokenizerModel", tokenizerModel); conf.set("Ivory.CollectionVocab", collectionVocab); conf.set("Ivory.FinalVocab", collectionVocab); if (mode == CROSS_LINGUAL_F) { // non-English side, needs to be translated fVocab_f2e = args[6]; // same as collection vocab eVocab_f2e = args[7]; ttable_f2e = args[8]; eVocab_e2f = args[9]; fVocab_e2f = args[10]; ttable_e2f = args[11]; conf.set("Ivory.F_Vocab_F2E", fVocab_f2e); conf.set("Ivory.E_Vocab_F2E", eVocab_f2e); conf.set("Ivory.TTable_F2E", ttable_f2e); conf.set("Ivory.E_Vocab_E2F", eVocab_e2f); conf.set("Ivory.F_Vocab_E2F", fVocab_e2f); conf.set("Ivory.TTable_E2F", ttable_e2f); conf.set("Ivory.FinalVocab", eVocab_e2f); } } int numMappers = 100; int numReducers = 100; LOG.info("Tool name: WikipediaDriver"); LOG.info(" - Index path: " + indexRootPath); LOG.info(" - Raw collection path: " + rawCollection); LOG.info(" - Compressed collection path: " + seqCollection); LOG.info(" - Tokenizer class: " + tokenizerClass); LOG.info(" - Minimum # terms per article : " + MinNumTermsPerArticle); if (mode == 
CROSS_LINGUAL_E || mode == CROSS_LINGUAL_F) { LOG.info("Cross-lingual collection : Preprocessing " + collectionLang + " side."); LOG.info(" - Collection vocab file: " + collectionVocab); LOG.info(" - Tokenizer model: " + tokenizerModel); if (mode == CROSS_LINGUAL_F) { LOG.info(" - TTable file " + collectionLang + " --> English : " + ttable_f2e); LOG.info(" - Source vocab file: " + fVocab_f2e); LOG.info(" - Target vocab file: " + eVocab_f2e); LOG.info(" - TTable file " + "English --> " + collectionLang + " : " + ttable_e2f); LOG.info(" - Source vocab file: " + fVocab_f2e); LOG.info(" - Target vocab file: " + eVocab_f2e); } } LOG.info("Launching with " + numMappers + " mappers, " + numReducers + " reducers..."); FileSystem fs = FileSystem.get(conf); Path p = new Path(indexRootPath); if (!fs.exists(p)) { LOG.info("Index path doesn't exist, creating..."); fs.mkdirs(p); } RetrievalEnvironment env = new RetrievalEnvironment(indexRootPath, fs); // Build docno mapping from raw collection Path mappingFile = env.getDocnoMappingData(); if (!fs.exists(mappingFile)) { LOG.info(mappingFile + " doesn't exist, creating..."); String[] arr = new String[] { "-input=" + rawCollection, "-output_path=" + indexRootPath + "/wiki-docid-tmp", "-output_file=" + mappingFile.toString(), "-keep_all=false" }; BuildWikipediaDocnoMapping tool = new BuildWikipediaDocnoMapping(); tool.setConf(conf); tool.run(arr); fs.delete(new Path(indexRootPath + "/wiki-docid-tmp"), true); } else { LOG.info(p + " exists"); } // Repack Wikipedia into sequential compressed block p = new Path(seqCollection); if (!fs.exists(p)) { LOG.info(seqCollection + " doesn't exist, creating..."); String[] arr = new String[] { "-input=" + rawCollection, "-output=" + seqCollection, "-mapping_file=" + mappingFile.toString(), "-compression_type=block", "-wiki_language=" + collectionLang }; RepackWikipedia tool = new RepackWikipedia(); tool.setConf(conf); tool.run(arr); } conf.set(Constants.CollectionName, "Wikipedia-" + 
collectionLang); conf.setInt(Constants.NumMapTasks, numMappers); conf.setInt(Constants.NumReduceTasks, numReducers); conf.set(Constants.CollectionPath, seqCollection); conf.set(Constants.IndexPath, indexRootPath); conf.set(Constants.InputFormat, SequenceFileInputFormat.class.getCanonicalName()); conf.set(Constants.DocnoMappingClass, WikipediaDocnoMapping.class.getCanonicalName()); conf.set(Constants.Tokenizer, tokenizerClass); //"ivory.tokenize.OpenNLPTokenizer" conf.setInt(Constants.MinDf, MinDF); conf.setInt(Constants.MaxDf, Integer.MAX_VALUE); conf.setInt(Constants.DocnoOffset, 0); // docnos start at 1 conf.setInt(Constants.TermIndexWindow, TermIndexWindow); // Builds term doc vectors from document collection, and filters the terms that are not included in Ivory.SrcVocab long startTime = System.currentTimeMillis(); long preprocessStartTime = System.currentTimeMillis(); LOG.info("Building term doc vectors..."); new BuildTermDocVectors(conf).run(); LOG.info("Job finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds"); // Get CF and DF counts startTime = System.currentTimeMillis(); LOG.info("Counting terms..."); new ComputeGlobalTermStatistics(conf).run(); LOG.info("TermCount = " + env.readCollectionTermCount() + "\nJob finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds"); // Build a map from terms to sequentially generated integer term ids startTime = System.currentTimeMillis(); LOG.info("Building term-to-integer id mapping..."); new BuildDictionary(conf).run(); LOG.info("Job finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds"); // Compute term weights, and output weighted term doc vectors startTime = System.currentTimeMillis(); LOG.info("Building weighted term doc vectors..."); conf.set("Ivory.ScoringModel", "ivory.pwsim.score.Bm25"); if (mode == CROSS_LINGUAL_F) { conf.setInt("Ivory.MinNumTerms", MinNumTermsPerArticle); // translate term doc vectors into English. 
conf.setBoolean("Ivory.Normalize", true); new BuildTranslatedTermDocVectors(conf).run(); } else { conf.setInt("Ivory.MinNumTerms", MinNumTermsPerArticle); // get weighted term doc vectors conf.setBoolean("Ivory.Normalize", true); new BuildWeightedTermDocVectors(conf).run(); } LOG.info("Job finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds"); // normalize (optional) and convert weighted term doc vectors into int doc vectors for efficiency startTime = System.currentTimeMillis(); LOG.info("Building weighted integer doc vectors..."); conf.setBoolean("Ivory.Normalize", IsNormalized); if (mode == MONO_LINGUAL) { new BuildIntDocVectors(conf).run(); new BuildWeightedIntDocVectors(conf).run(); LOG.info("Job BuildWeightedIntDocVectors finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds"); } else { BuildTargetLangWeightedIntDocVectors weightedIntVectorsTool = new BuildTargetLangWeightedIntDocVectors( conf); LOG.info("Job BuildTargetLangWeightedIntDocVectors finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds"); int finalNumDocs = weightedIntVectorsTool.run(); if (finalNumDocs > 0) { LOG.info("Changed doc count from " + env.readCollectionDocumentCount() + " to = " + finalNumDocs); env.writeCollectionDocumentCount(finalNumDocs); } // set Property.CollectionTermCount to the size of the target vocab. since all docs are translated into that vocab. This property is read by WriteRandomVectors via RunComputeSignatures. Vocab engVocabH = null; try { engVocabH = HadoopAlign.loadVocab(new Path(conf.get("Ivory.FinalVocab")), conf); } catch (IOException e) { e.printStackTrace(); } LOG.info("Changed term count to : " + env.readCollectionTermCount() + " = " + engVocabH.size()); env.writeCollectionTermCount(engVocabH.size()); } LOG.info("Preprocessing job finished in " + (System.currentTimeMillis() - preprocessStartTime) / 1000.0 + " seconds"); return 0; }
From source file:ivory.core.driver.PreprocessWt10g.java
License:Apache License
/**
 * Runs this tool: creates the index directory and the docno mapping when they are missing,
 * then runs the indexing stages for the Wt10g collection.
 *
 * @param args exactly two entries: the collection path and the index root path
 * @return -1 when the argument count is wrong, 0 otherwise
 * @throws Exception if any file system operation or indexing stage throws
 */
public int run(String[] args) throws Exception {
  final boolean wrongArgCount = args.length != 2;
  if (wrongArgCount) {
    printUsage();
    return -1;
  }

  final String collection = args[0];
  final String indexRootPath = args[1];

  LOG.info("Tool name: " + PreprocessWt10g.class.getCanonicalName());
  LOG.info(" - Collection path: " + collection);
  LOG.info(" - Index path: " + indexRootPath);

  final Configuration conf = getConf();
  final FileSystem fs = FileSystem.get(conf);

  // The index root must exist before the retrieval environment is used.
  final Path indexDir = new Path(indexRootPath);
  final boolean indexDirMissing = !fs.exists(indexDir);
  if (indexDirMissing) {
    LOG.info("index directory doesn't exist, creating...");
    fs.mkdirs(indexDir);
  }

  final RetrievalEnvironment env = new RetrievalEnvironment(indexRootPath, fs);

  // The docno mapping translates a docid (String) into a sequentially numbered docno (int);
  // it is generated here when the mapping file is absent.
  final Path mappingFile = env.getDocnoMappingData();
  final Path mappingDir = env.getDocnoMappingDirectory();
  final boolean mappingMissing = !fs.exists(mappingFile);
  if (mappingMissing) {
    LOG.info("docno-mapping.dat doesn't exist, creating...");
    String[] numberingArgs = {
        collection, mappingDir.toString(), mappingFile.toString(), "100" };
    NumberTrecWebDocuments numberer = new NumberTrecWebDocuments();
    numberer.setConf(conf);
    numberer.run(numberingArgs);

    fs.delete(mappingDir, true);
  }

  // Collection description consumed by the indexing stages below.
  conf.set(Constants.CollectionName, "Wt10g");
  conf.set(Constants.CollectionPath, collection);
  conf.set(Constants.IndexPath, indexRootPath);
  conf.set(Constants.InputFormat, SequenceFileInputFormat.class.getCanonicalName());
  conf.set(Constants.Tokenizer, GalagoTokenizer.class.getCanonicalName());
  conf.set(Constants.DocnoMappingClass, Wt10gDocnoMapping.class.getCanonicalName());
  conf.set(Constants.DocnoMappingFile, mappingFile.toString());

  conf.setInt(Constants.DocnoOffset, 0); // per the original note, docnos start at 1
  conf.setInt(Constants.MinDf, 10);
  conf.setInt(Constants.MaxDf, Integer.MAX_VALUE);
  conf.setInt(Constants.TermIndexWindow, 8);

  // Indexing pipeline, in dependency order.
  new BuildTermDocVectors(conf).run();
  new ComputeGlobalTermStatistics(conf).run();
  new BuildDictionary(conf).run();
  new BuildIntDocVectors(conf).run();

  // Forward indexes over both vector representations.
  new BuildIntDocVectorsForwardIndex(conf).run();
  new BuildTermDocVectorsForwardIndex(conf).run();

  return 0;
}
From source file:ivory.core.index.BuildIPInvertedIndexDocSorted.java
License:Apache License
public int runTool() throws Exception { Configuration conf = getConf(); FileSystem fs = FileSystem.get(conf); String indexPath = conf.get(Constants.IndexPath); RetrievalEnvironment env = new RetrievalEnvironment(indexPath, fs); String collectionName = env.readCollectionName(); int reduceTasks = conf.getInt(Constants.NumReduceTasks, 0); int minSplitSize = conf.getInt(Constants.MinSplitSize, 0); int collectionDocCnt = env.readCollectionDocumentCount(); String postingsType = conf.get(Constants.PostingsListsType, ivory.core.data.index.PostingsListDocSortedPositional.class.getCanonicalName()); @SuppressWarnings("unchecked") Class<? extends PostingsList> postingsClass = (Class<? extends PostingsList>) Class.forName(postingsType); LOG.info("PowerTool: " + BuildIPInvertedIndexDocSorted.class.getCanonicalName()); LOG.info(String.format(" - %s: %s", Constants.IndexPath, indexPath)); LOG.info(String.format(" - %s: %s", Constants.CollectionName, collectionName)); LOG.info(String.format(" - %s: %s", Constants.CollectionDocumentCount, collectionDocCnt)); LOG.info(String.format(" - %s: %s", Constants.PostingsListsType, postingsClass.getCanonicalName())); LOG.info(String.format(" - %s: %s", Constants.NumReduceTasks, reduceTasks)); LOG.info(String.format(" - %s: %s", Constants.MinSplitSize, minSplitSize)); if (!fs.exists(new Path(indexPath))) { fs.mkdirs(new Path(indexPath)); }//from ww w. ja v a 2 s . 
co m Path inputPath = new Path(env.getIntDocVectorsDirectory()); Path postingsPath = new Path(env.getPostingsDirectory()); if (fs.exists(postingsPath)) { LOG.info("Postings already exist: no indexing will be performed."); return 0; } conf.setInt(Constants.CollectionDocumentCount, collectionDocCnt); conf.setInt("mapred.min.split.size", minSplitSize); conf.set("mapred.child.java.opts", "-Xmx2048m"); Job job = new Job(conf, BuildIPInvertedIndexDocSorted.class.getSimpleName() + ":" + collectionName); job.setJarByClass(BuildIPInvertedIndexDocSorted.class); job.setNumReduceTasks(reduceTasks); FileInputFormat.setInputPaths(job, inputPath); FileOutputFormat.setOutputPath(job, postingsPath); job.setInputFormatClass(SequenceFileInputFormat.class); job.setOutputFormatClass(SequenceFileOutputFormat.class); job.setMapOutputKeyClass(PairOfInts.class); job.setMapOutputValueClass(TermPositions.class); job.setOutputKeyClass(IntWritable.class); job.setOutputValueClass(postingsClass); job.setMapperClass(MyMapper.class); job.setReducerClass(MyReducer.class); job.setPartitionerClass(MyPartitioner.class); long startTime = System.currentTimeMillis(); job.waitForCompletion(true); LOG.info("Job Finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds"); env.writePostingsType(postingsClass.getCanonicalName()); return 0; }
From source file:ivory.core.index.BuildLPInvertedIndexDocSorted.java
License:Apache License
public int runTool() throws Exception { Configuration conf = getConf(); FileSystem fs = FileSystem.get(conf); String indexPath = conf.get(Constants.IndexPath); RetrievalEnvironment env = new RetrievalEnvironment(indexPath, fs); String collectionName = env.readCollectionName(); int reduceTasks = conf.getInt(Constants.NumReduceTasks, 0); int minSplitSize = conf.getInt(Constants.MinSplitSize, 0); int collectionDocCount = env.readCollectionDocumentCount(); String postingsType = conf.get(Constants.PostingsListsType, PostingsListDocSortedPositional.class.getCanonicalName()); @SuppressWarnings("unchecked") Class<? extends PostingsList> postingsClass = (Class<? extends PostingsList>) Class.forName(postingsType); // These are the default values for the LP algorithm. float mapMemoryThreshold = conf.getFloat(Constants.IndexingMapMemoryThreshold, 0.9f); float reduceMemoryThreshold = conf.getFloat(Constants.IndexingReduceMemoryThreshold, 0.9f); int maxHeap = conf.getInt(Constants.MaxHeap, 2048); int maxNDocsBeforeFlush = conf.getInt(Constants.MaxNDocsBeforeFlush, 50000); LOG.info("PowerTool: " + BuildLPInvertedIndexDocSorted.class.getSimpleName()); LOG.info(String.format(" - %s: %s", Constants.IndexPath, indexPath)); LOG.info(String.format(" - %s: %s", Constants.CollectionName, collectionName)); LOG.info(String.format(" - %s: %s", Constants.CollectionDocumentCount, collectionDocCount)); LOG.info(String.format(" - %s: %s", Constants.PostingsListsType, postingsClass.getCanonicalName())); LOG.info(String.format(" - %s: %s", Constants.NumReduceTasks, reduceTasks)); LOG.info(String.format(" - %s: %s", Constants.MinSplitSize, minSplitSize)); LOG.info(String.format(" - %s: %s", Constants.IndexingMapMemoryThreshold, mapMemoryThreshold)); LOG.info(String.format(" - %s: %s", Constants.IndexingReduceMemoryThreshold, reduceMemoryThreshold)); LOG.info(String.format(" - %s: %s", Constants.MaxHeap, maxHeap)); LOG.info(String.format(" - %s: %s", Constants.MaxNDocsBeforeFlush, 
maxNDocsBeforeFlush)); if (!fs.exists(new Path(indexPath))) { fs.mkdirs(new Path(indexPath)); }//from w ww . j a v a 2 s .c o m Path inputPath = new Path(env.getIntDocVectorsDirectory()); Path postingsPath = new Path(env.getPostingsDirectory()); if (fs.exists(postingsPath)) { LOG.info("Postings already exist: no indexing will be performed."); return 0; } conf.setInt(Constants.CollectionDocumentCount, collectionDocCount); conf.setInt("mapred.min.split.size", minSplitSize); //conf.set("mapred.child.java.opts", "-Xmx" + maxHeap + "m"); conf.set("mapreduce.map.memory.mb", "2048"); conf.set("mapreduce.map.java.opts", "-Xmx2048m"); conf.set("mapreduce.reduce.memory.mb", "2048"); conf.set("mapreduce.reduce.java.opts", "-Xmx2048m"); Job job = Job.getInstance(conf, BuildLPInvertedIndexDocSorted.class.getSimpleName() + ":" + collectionName); job.setJarByClass(BuildLPInvertedIndexDocSorted.class); job.setNumReduceTasks(reduceTasks); FileInputFormat.setInputPaths(job, inputPath); FileOutputFormat.setOutputPath(job, postingsPath); job.setInputFormatClass(SequenceFileInputFormat.class); job.setOutputFormatClass(SequenceFileOutputFormat.class); job.setMapOutputKeyClass(IntWritable.class); job.setMapOutputValueClass(PostingsListDocSortedPositional.class); job.setOutputKeyClass(IntWritable.class); job.setOutputValueClass(PostingsListDocSortedPositional.class); job.setMapperClass(MyMapper.class); job.setReducerClass(MyReducer.class); long startTime = System.currentTimeMillis(); job.waitForCompletion(true); LOG.info("Job Finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds"); env.writePostingsType("ivory.data.PostingsListDocSortedPositional"); return 0; }
From source file:ivory.driver.PreprocessGov2.java
License:Apache License
/** * Runs this tool./*from ww w. j ava 2s. co m*/ */ public int run(String[] args) throws Exception { if (args.length != 4) { printUsage(); return -1; } String collection = args[0]; String indexRootPath = args[1]; int numMappers = Integer.parseInt(args[2]); int numReducers = Integer.parseInt(args[3]); sLogger.info("Tool name: PreprocessGov2"); sLogger.info(" - Collection path: " + collection); sLogger.info(" - Index path: " + indexRootPath); Configuration conf = getConf(); FileSystem fs = FileSystem.get(conf); // Create the index directory if it doesn't already exist. Path p = new Path(indexRootPath); if (!fs.exists(p)) { sLogger.info("index directory doesn't exist, creating..."); fs.mkdirs(p); } RetrievalEnvironment env = new RetrievalEnvironment(indexRootPath, fs); // Look for the docno mapping, which maps from docid (String) to docno // (sequentially-number integer). If it doesn't exist create it. Path mappingFile = env.getDocnoMappingData(); Path mappingDir = env.getDocnoMappingDirectory(); if (!fs.exists(mappingFile)) { sLogger.info("docno-mapping.dat doesn't exist, creating..."); String[] arr = new String[] { collection, mappingDir.toString(), mappingFile.toString(), new Integer(numMappers).toString() }; NumberTrecWebDocuments tool = new NumberTrecWebDocuments(); tool.setConf(conf); tool.run(arr); fs.delete(mappingDir, true); } // Now we're ready to start the preprocessing pipeline... set // appropriate properties. 
conf.setInt("Ivory.NumMapTasks", numMappers); conf.setInt("Ivory.NumReduceTasks", numReducers); conf.set("Ivory.CollectionName", "Gov2"); conf.set("Ivory.CollectionPath", collection); conf.set("Ivory.IndexPath", indexRootPath); conf.set("Ivory.InputFormat", "org.apache.hadoop.mapred.SequenceFileInputFormat"); conf.set("Ivory.Tokenizer", "ivory.tokenize.GalagoTokenizer"); conf.set("Ivory.DocnoMappingClass", "edu.umd.cloud9.collection.trecweb.Gov2DocnoMapping"); conf.set("Ivory.DocnoMappingFile", mappingFile.toString()); conf.setInt("Ivory.DocnoOffset", 0); // docnos start at 1 conf.setInt("Ivory.MinDf", 10); conf.setInt("Ivory.MaxDf", Integer.MAX_VALUE); conf.setInt("Ivory.TermIndexWindow", 8); new BuildTermDocVectors(conf).run(); new GetTermCount(conf).run(); new BuildTermIdMap(conf).run(); new BuildIntDocVectors(conf).run(); new BuildIntDocVectorsForwardIndex(conf).run(); new BuildTermDocVectorsForwardIndex(conf).run(); return 0; }