List of usage examples for org.apache.hadoop.fs.FileSystem.mkdirs
public boolean mkdirs(Path f) throws IOException
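Before the examples, a minimal usage sketch (the paths below are hypothetical). mkdirs creates the directory together with any missing parents, much like mkdir -p, and returns true if the directory was created or already exists; an overload additionally takes an FsPermission for the new directory:

import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.permission.FsPermission;

public class MkdirsSketch {
    public static void main(String[] args) throws IOException {
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(conf);

        // Creates /tmp/example/output and any missing parent directories.
        Path dir = new Path("/tmp/example/output");
        if (!fs.mkdirs(dir)) {
            throw new IOException("Could not create " + dir);
        }

        // Overload that also sets the permission on the new directory.
        fs.mkdirs(new Path("/tmp/example/restricted"), new FsPermission((short) 0700));
    }
}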
From source file:io.gzinga.hadoop.TestSplittableGZipCodec.java
License:Apache License
@Test
public void testSplittableGZipCodec() {
    try {
        Configuration conf = new Configuration();
        conf.set("fs.defaultFS", "file:///");
        FileSystem fs = FileSystem.get(conf);
        fs.mkdirs(new Path("target/test"));

        GZipOutputStreamRandomAccess gzip = new GZipOutputStreamRandomAccess(
                fs.create(new Path("target/test/testfile1.gz")));
        String str = "This is line\n";
        for (int i = 1; i <= 10000; i++) {
            gzip.write(str.getBytes());
            if (i % 100 == 0) {
                gzip.addOffset(i / 100L);
            }
        }
        Assert.assertEquals(gzip.getOffsetMap().size(), 100);
        gzip.close();

        conf.set("mapreduce.framework.name", "local");
        conf.set("io.compression.codecs", "io.gzinga.hadoop.SplittableGZipCodec");
        conf.set("mapreduce.input.fileinputformat.split.maxsize", "20000");
        Job job = new Job(conf, "word count");
        job.setJarByClass(WordCount.class);
        job.setMapperClass(WordCount.TokenizerMapper.class);
        job.setCombinerClass(IntSumReducer.class);
        job.setReducerClass(IntSumReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        FileInputFormat.addInputPath(job, new Path("target/test/testfile1.gz"));
        FileOutputFormat.setOutputPath(job, new Path("target/test/testfile2"));
        job.waitForCompletion(true);

        BufferedReader br = new BufferedReader(
                new InputStreamReader(fs.open(new Path("target/test/testfile2/part-r-00000"))));
        Assert.assertEquals("This\t10000", br.readLine());
        Assert.assertEquals("is\t10000", br.readLine());
        Assert.assertEquals("line\t10000", br.readLine());
        br.close();
    } catch (Exception e) {
        e.printStackTrace();
        Assert.fail();
    } finally {
        FileUtil.fullyDelete(new File("target/test/testfile2"));
        FileUtil.fullyDelete(new File("target/test/testfile1.gz"));
    }
}
From source file:io.hops.erasure_coding.Encoder.java
License:Apache License
/**
 * The interface to use to generate a parity file.
 * This method can be called multiple times with the same Encoder object,
 * thus allowing reuse of the buffers allocated by the Encoder object.
 *
 * @param fs The filesystem containing the source file.
 * @param srcFile The source file.
 * @param parityFile The parity file to be generated.
 */
public void encodeFile(Configuration jobConf, FileSystem fs, Path srcFile, FileSystem parityFs,
        Path parityFile, short parityRepl, long numStripes, long blockSize, Progressable reporter,
        StripeReader sReader) throws IOException {
    long expectedParityBlocks = numStripes * codec.parityLength;
    long expectedParityFileSize = numStripes * blockSize * codec.parityLength;

    if (!parityFs.mkdirs(parityFile.getParent())) {
        throw new IOException("Could not create parent dir " + parityFile.getParent());
    }
    // Delete the destination if it exists.
    if (parityFs.exists(parityFile)) {
        parityFs.delete(parityFile, false);
    }

    // Writing out a large parity file at replication 1 is difficult since
    // some datanode could die and we would not be able to close() the file.
    // So write at replication 2 and then reduce it after close() succeeds.
    short tmpRepl = parityRepl;
    if (expectedParityBlocks >= conf.getInt("raid.encoder.largeparity.blocks", 20)) {
        if (parityRepl == 1) {
            tmpRepl = 2;
        }
    }
    FSDataOutputStream out = parityFs.create(parityFile, true,
            conf.getInt("io.file.buffer.size", 64 * 1024), tmpRepl, blockSize);
    DFSOutputStream dfsOut = (DFSOutputStream) out.getWrappedStream();
    dfsOut.enableParityStream(codec.getStripeLength(), codec.getParityLength(), srcFile.toUri().getPath());
    try {
        encodeFileToStream(fs, srcFile, parityFile, sReader, blockSize, out, reporter);
        out.close();
        out = null;
        LOG.info("Wrote parity file " + parityFile);
        FileStatus tmpStat = parityFs.getFileStatus(parityFile);
        if (tmpStat.getLen() != expectedParityFileSize) {
            throw new IOException("Expected parity size " + expectedParityFileSize
                    + " does not match actual " + tmpStat.getLen());
        }
        if (tmpRepl > parityRepl) {
            parityFs.setReplication(parityFile, parityRepl);
        }
    } finally {
        if (out != null) {
            out.close();
        }
    }
}
From source file:io.hops.experiments.utils.DFSOperationsUtils.java
License:Apache License
public static void mkdirs(FileSystem dfs, String pathStr) throws IOException {
    if (SERVER_LESS_MODE) {
        serverLessModeRandomWait();
        return;
    }
    dfs.mkdirs(new Path(pathStr));
}
From source file:io.seqware.pipeline.plugins.sanity.checks.HDFS_Check.java
License:Open Source License
@Override
public boolean check(QueryRunner qRunner, Metadata metadataWS) throws SQLException {
    FileSystem fileSystem = null;
    HashMap<String, String> settings = (HashMap<String, String>) ConfigTools.getSettings();
    if (settings.isEmpty()) {
        return false;
    } else if (!settings.containsKey("FS.DEFAULTFS") || !settings.containsKey("FS.HDFS.IMPL")) {
        return false;
    } else if (!settings.containsKey("HBASE.ZOOKEEPER.QUORUM")
            || !settings.containsKey("HBASE.ZOOKEEPER.PROPERTY.CLIENTPORT")
            || !settings.containsKey("HBASE.MASTER") || !settings.containsKey("MAPRED.JOB.TRACKER")) {
        return false;
    }
    try {
        Configuration conf = new Configuration();
        conf.set("hbase.zookeeper.quorum", settings.get("HBASE.ZOOKEEPER.QUORUM"));
        conf.set("hbase.zookeeper.property.clientPort", settings.get("HBASE.ZOOKEEPER.PROPERTY.CLIENTPORT"));
        conf.set("hbase.master", settings.get("HBASE.MASTER"));
        conf.set("mapred.job.tracker", settings.get("MAPRED.JOB.TRACKER"));
        conf.set("fs.default.name", settings.get("FS.DEFAULTFS"));
        conf.set("fs.defaultFS", settings.get("FS.DEFAULTFS")); // the key is case-sensitive: "fs.defaultFS"
        conf.set("fs.hdfs.impl", settings.get("FS.HDFS.IMPL"));
        fileSystem = FileSystem.get(conf);
        // Creating and scheduling deletion of a test directory doubles as a connectivity check.
        Path path = new Path("test");
        fileSystem.mkdirs(path);
        fileSystem.deleteOnExit(path);
    } catch (IOException ex) {
        System.err.println("Error connecting to hdfs: " + ex.getMessage());
        return false;
    } finally {
        try {
            if (fileSystem != null) {
                fileSystem.close();
            }
        } catch (IOException ex) {
            Logger.getLogger(HDFS_Check.class.getName()).log(Level.SEVERE, null, ex);
        }
    }
    return true;
}
From source file:it.isislab.sof.core.engine.hadoop.mapreduce.generic.SOFReducerGeneric.java
License:Apache License
public void reduce(Text key, Iterator<Text> values, OutputCollector<Text, Text> output, Reporter reporter)
        throws IOException {
    String EVALUATION_PROGRAM_THREAD = "evaluation" + Thread.currentThread().getId();
    FileSystem fs = FileSystem.get(conf);
    if (ISLOOP) {
        Path eprogram = new Path(EVALUATION_PROGRAM_THREAD);
        fs.copyToLocalFile(new Path(RATING_PROGRAM), eprogram);
        try {
            fs.mkdirs(new Path(this.RATING_PATH));
        } catch (Exception e) {
            // Ignored: the rating directory may already exist.
        }
    }
    if (ISLOOP) {
        Random r = new Random(System.currentTimeMillis());
        String id = MD5(key.toString() + r.nextDouble());
        String tmpEvalXml = "tmpEval" + id + ".xml";
        Path ptemp = new Path(tmpEvalXml);
        Path file_output = new Path(key.toString());
        fs.copyToLocalFile(file_output, ptemp);
        String xmlOutput = key.toString().substring(key.toString().lastIndexOf("/") + 1);
        generateEvaluation(tmpEvalXml, xmlOutput, EVALUATION_PROGRAM_THREAD);
        File f = new File(System.getProperty("user.dir") + "/" + EVALUATION_PROGRAM_THREAD);
        f.delete();
    }
}
From source file:it.isislab.sof.core.engine.hadoop.mapreduce.netlogo.SOFReducerNetLogo.java
License:Apache License
public void reduce(Text key, Iterator<Text> values, OutputCollector<Text, Text> output, Reporter reporter)
        throws IOException {
    String EVALUATION_PROGRAM_THREAD = "evaluation" + Thread.currentThread().getId();
    FileSystem fs = FileSystem.get(conf);
    if (ISLOOP) {
        Path eprogram = new Path(EVALUATION_PROGRAM_THREAD);
        fs.copyToLocalFile(new Path(RATING_PROGRAM), eprogram);
        try {
            fs.mkdirs(new Path(RATING_PATH));
        } catch (Exception e) {
            // Ignored: the rating directory may already exist.
        }
    }
    if (ISLOOP) {
        Random r = new Random(System.currentTimeMillis());
        String id = MD5(key.toString() + r.nextDouble());
        String tmpEvalXml = "tmpEval" + id + ".xml";
        Path ptemp = new Path(tmpEvalXml);
        Path file_output = new Path(key.toString());
        fs.copyToLocalFile(file_output, ptemp);
        String xmlOutput = key.toString().substring(key.toString().lastIndexOf("/") + 1);
        generateEvaluation(tmpEvalXml, xmlOutput, EVALUATION_PROGRAM_THREAD);
        File f = new File(System.getProperty("user.dir") + "/" + EVALUATION_PROGRAM_THREAD);
        f.delete();
    }
}
From source file:it.tizianofagni.sparkboost.DataUtils.java
License:Apache License
/**
 * Write a text file on Hadoop file system by using standard Hadoop API.
 *
 * @param outputPath The file to be written.
 * @param content The content to put in the file.
 */
public static void saveHadoopTextFile(String outputPath, String content) {
    try {
        Configuration configuration = new Configuration();
        Path file = new Path(outputPath);
        Path parentFile = file.getParent();
        FileSystem hdfs = FileSystem.get(file.toUri(), configuration);
        if (parentFile != null)
            hdfs.mkdirs(parentFile);
        OutputStream os = hdfs.create(file, true);
        BufferedWriter br = new BufferedWriter(new OutputStreamWriter(os, "UTF-8"));
        br.write(content);
        br.close();
        hdfs.close();
    } catch (Exception e) {
        throw new RuntimeException("Writing Hadoop text file", e);
    }
}
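A variant of the same write, as a minimal sketch (assuming Java 7+), using try-with-resources so the writer and FileSystem handle are closed even when an error occurs:

import java.io.BufferedWriter;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.nio.charset.StandardCharsets;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public static void saveHadoopTextFile(String outputPath, String content) {
    Path file = new Path(outputPath);
    try (FileSystem hdfs = FileSystem.get(file.toUri(), new Configuration())) {
        if (file.getParent() != null) {
            hdfs.mkdirs(file.getParent()); // ensure the parent directory exists
        }
        try (BufferedWriter br = new BufferedWriter(
                new OutputStreamWriter(hdfs.create(file, true), StandardCharsets.UTF_8))) {
            br.write(content);
        }
    } catch (IOException e) {
        throw new RuntimeException("Writing Hadoop text file", e);
    }
}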
From source file:ivory.app.PreprocessClueWebEnglish.java
License:Apache License
/**
 * Runs this tool.
 */
@SuppressWarnings({ "static-access" })
@Override
public int run(String[] args) throws Exception {
    Options options = new Options();
    options.addOption(OptionBuilder.withArgName("path").hasArg()
            .withDescription("(required) collection path").create(PreprocessCollection.COLLECTION_PATH));
    options.addOption(OptionBuilder.withArgName("path").hasArg()
            .withDescription("(required) index path").create(PreprocessCollection.INDEX_PATH));
    options.addOption(OptionBuilder.withArgName("num").hasArg()
            .withDescription("(required) segment").create(SEGMENT));

    CommandLine cmdline;
    CommandLineParser parser = new GnuParser();
    try {
        cmdline = parser.parse(options, args);
    } catch (ParseException exp) {
        System.err.println("Error parsing command line: " + exp.getMessage());
        return -1;
    }

    if (!cmdline.hasOption(PreprocessCollection.COLLECTION_PATH)
            || !cmdline.hasOption(PreprocessCollection.INDEX_PATH) || !cmdline.hasOption(SEGMENT)) {
        System.out.println("args: " + Arrays.toString(args));
        HelpFormatter formatter = new HelpFormatter();
        formatter.setWidth(120);
        formatter.printHelp(this.getClass().getName(), options);
        ToolRunner.printGenericCommandUsage(System.out);
        return -1;
    }

    String collection = cmdline.getOptionValue(PreprocessCollection.COLLECTION_PATH);
    String indexPath = cmdline.getOptionValue(PreprocessCollection.INDEX_PATH);
    int segment = Integer.parseInt(cmdline.getOptionValue(SEGMENT));

    LOG.info("Tool name: " + PreprocessClueWebEnglish.class.getSimpleName());
    LOG.info(" - collection path: " + collection);
    LOG.info(" - index path: " + indexPath);
    LOG.info(" - segment: " + segment);

    Configuration conf = getConf();
    FileSystem fs = FileSystem.get(conf);

    // Create the index directory if it doesn't already exist.
    Path p = new Path(indexPath);
    if (!fs.exists(p)) {
        LOG.info("index path doesn't exist, creating...");
        fs.mkdirs(p);
    } else {
        LOG.info("Index directory " + p + " already exists!");
        return -1;
    }

    RetrievalEnvironment env = new RetrievalEnvironment(indexPath, fs);
    Path mappingFile = env.getDocnoMappingData();
    new ClueWarcDocnoMappingBuilder().build(new Path(collection), mappingFile, conf);

    conf.set(Constants.CollectionName, "ClueWeb:English:Segment" + segment);
    conf.set(Constants.CollectionPath, collection);
    conf.set(Constants.IndexPath, indexPath);
    conf.set(Constants.InputFormat, SequenceFileInputFormat.class.getCanonicalName());
    conf.set(Constants.Tokenizer, GalagoTokenizer.class.getCanonicalName());
    conf.set(Constants.DocnoMappingClass, ClueWarcDocnoMapping.class.getCanonicalName());
    conf.set(Constants.DocnoMappingFile, env.getDocnoMappingData().toString());
    conf.setInt(Constants.DocnoOffset, DOCNO_OFFSETS[segment]);
    conf.setInt(Constants.MinDf, 10);
    conf.setInt(Constants.MaxDf, Integer.MAX_VALUE);

    new BuildTermDocVectors(conf).run();
    new ComputeGlobalTermStatistics(conf).run();
    new BuildDictionary(conf).run();
    new BuildIntDocVectors(conf).run();
    new BuildIntDocVectorsForwardIndex(conf).run();
    new BuildTermDocVectorsForwardIndex(conf).run();

    return 0;
}
From source file:ivory.app.PreprocessCollection.java
License:Apache License
/**
 * Runs this tool.
 */
@Override
public int run(String[] args) throws Exception {
    Options options = createOptions();
    CommandLine cmdline;
    CommandLineParser parser = new GnuParser();
    try {
        cmdline = parser.parse(options, args);
    } catch (ParseException exp) {
        System.err.println("Error parsing command line: " + exp.getMessage());
        return -1;
    }

    if (!cmdline.hasOption(COLLECTION_PATH) || !cmdline.hasOption(COLLECTION_NAME)
            || !cmdline.hasOption(INDEX_PATH) || !cmdline.hasOption(DOCNO_MAPPING)) {
        System.out.println("args: " + Arrays.toString(args));
        HelpFormatter formatter = new HelpFormatter();
        formatter.setWidth(120);
        formatter.printHelp(this.getClass().getName(), options);
        ToolRunner.printGenericCommandUsage(System.out);
        return -1;
    }

    String collection = cmdline.getOptionValue(COLLECTION_PATH);
    String collectionName = cmdline.getOptionValue(COLLECTION_NAME);
    String indexPath = cmdline.getOptionValue(INDEX_PATH);

    int docnoOffset = 0;
    if (cmdline.hasOption(DOCNO_OFFSET)) {
        docnoOffset = Integer.parseInt(cmdline.getOptionValue(DOCNO_OFFSET));
    }

    Class<? extends DocnoMapping> docnoMappingClass = null;
    try {
        docnoMappingClass = (Class<? extends DocnoMapping>) Class
                .forName(cmdline.getOptionValue(DOCNO_MAPPING));
    } catch (ClassNotFoundException e) {
        throw new RuntimeException(e);
    }

    @SuppressWarnings("rawtypes")
    Class<? extends InputFormat> inputFormatClass = SequenceFileInputFormat.class;
    if (cmdline.hasOption(INPUTFORMAT)) {
        try {
            inputFormatClass = (Class<? extends InputFormat<?, ?>>) Class
                    .forName(cmdline.getOptionValue(INPUTFORMAT));
        } catch (ClassNotFoundException e) {
            throw new RuntimeException(e);
        }
    }

    Class<? extends Tokenizer> tokenizerClass = GalagoTokenizer.class;
    if (cmdline.hasOption(TOKENIZER)) {
        try {
            tokenizerClass = (Class<? extends Tokenizer>) Class.forName(cmdline.getOptionValue(TOKENIZER));
        } catch (ClassNotFoundException e) {
            throw new RuntimeException(e);
        }
    }

    int minDf = 2;
    if (cmdline.hasOption(MIN_DF)) {
        minDf = Integer.parseInt(cmdline.getOptionValue(MIN_DF));
    }

    LOG.info("Tool name: " + this.getClass().getSimpleName());
    LOG.info(String.format(" -%s %s", COLLECTION_PATH, collection));
    LOG.info(String.format(" -%s %s", COLLECTION_NAME, collectionName));
    LOG.info(String.format(" -%s %s", INDEX_PATH, indexPath));
    LOG.info(String.format(" -%s %s", DOCNO_MAPPING, docnoMappingClass.getCanonicalName()));
    LOG.info(String.format(" -%s %s", INPUTFORMAT, inputFormatClass.getCanonicalName()));
    LOG.info(String.format(" -%s %s", TOKENIZER, tokenizerClass.getCanonicalName()));
    LOG.info(String.format(" -%s %d", MIN_DF, minDf));

    Configuration conf = getConf();
    FileSystem fs = FileSystem.get(conf);

    // Create the index directory if it doesn't already exist.
    Path p = new Path(indexPath);
    if (!fs.exists(p)) {
        LOG.info("Index directory " + p + " doesn't exist, creating.");
        fs.mkdirs(p);
    } else {
        LOG.info("Index directory " + p + " already exists!");
        return -1;
    }

    RetrievalEnvironment env = new RetrievalEnvironment(indexPath, fs);

    conf.set(Constants.CollectionName, collectionName);
    conf.set(Constants.CollectionPath, collection);
    conf.set(Constants.IndexPath, indexPath);
    conf.set(Constants.InputFormat, inputFormatClass.getCanonicalName());
    conf.set(Constants.Tokenizer, tokenizerClass.getCanonicalName());
    conf.set(Constants.DocnoMappingClass, docnoMappingClass.getCanonicalName());
    conf.set(Constants.DocnoMappingFile, env.getDocnoMappingData().toString());
    conf.setInt(Constants.DocnoOffset, docnoOffset);
    conf.setInt(Constants.MinDf, minDf);
    conf.setInt(Constants.MaxDf, Integer.MAX_VALUE);

    Path mappingFile = env.getDocnoMappingData();
    docnoMappingClass.newInstance().getBuilder().build(new Path(collection), mappingFile, conf);

    new BuildTermDocVectors(conf).run();
    new ComputeGlobalTermStatistics(conf).run();
    new BuildDictionary(conf).run();
    new BuildIntDocVectors(conf).run();
    new BuildIntDocVectorsForwardIndex(conf).run();
    new BuildTermDocVectorsForwardIndex(conf).run();

    return 0;
}
From source file:ivory.app.PreprocessTrecForeign.java
License:Apache License
@SuppressWarnings("static-access") private Configuration parseArgs(String[] args) { Configuration conf = getConf(); options = new Options(); options.addOption(OptionBuilder.withDescription("tokenizer class").withArgName("class").hasArg() .isRequired().create(TOKENIZER_CLASS_OPTION)); options.addOption(OptionBuilder.withDescription("path to tokenizer model file/directory") .withArgName("path").hasArg().create(TOKENIZER_MODEL_OPTION)); options.addOption(OptionBuilder.withDescription("path to index directory").withArgName("path").hasArg() .isRequired().isRequired().create(INDEX_PATH_OPTION)); options.addOption(OptionBuilder.withDescription("path to XML collection file").withArgName("path").hasArg() .isRequired().create(INPUT_PATH_OPTION)); options.addOption(OptionBuilder.withDescription("two-letter collection language code") .withArgName("en|de|fr|zh|es|ar|tr").hasArg().isRequired().create(LANGUAGE_OPTION)); options.addOption(OptionBuilder.withDescription("path to stopwords file").withArgName("path").hasArg() .create(STOPWORDS_OPTION));//from w w w. ja v a 2s . c o m options.addOption(OptionBuilder.withDescription("collection name").withArgName("path").hasArg() .create(COLLECTION_NAME_OPTION)); try { FileSystem fs = FileSystem.get(conf); CommandLine cmdline; CommandLineParser parser = new GnuParser(); cmdline = parser.parse(options, args); String collection = cmdline.getOptionValue(INPUT_PATH_OPTION); String indexRootPath = cmdline.getOptionValue(INDEX_PATH_OPTION); String language = cmdline.getOptionValue(LANGUAGE_OPTION); String tokenizerClass = cmdline.getOptionValue(TOKENIZER_CLASS_OPTION); String stopwordsFile = null; String tokenizerPath = null; conf.set(Constants.CollectionPath, collection); conf.set(Constants.IndexPath, indexRootPath); conf.set(Constants.Tokenizer, tokenizerClass); conf.set(Constants.Language, language); if (cmdline.hasOption(COLLECTION_NAME_OPTION)) { conf.set(Constants.CollectionName, cmdline.getOptionValue(COLLECTION_NAME_OPTION)); } if (cmdline.hasOption(STOPWORDS_OPTION)) { stopwordsFile = cmdline.getOptionValue(STOPWORDS_OPTION); conf.set(Constants.StopwordList, stopwordsFile); } if (cmdline.hasOption(TOKENIZER_MODEL_OPTION)) { tokenizerPath = cmdline.getOptionValue(TOKENIZER_MODEL_OPTION); conf.set(Constants.TokenizerData, tokenizerPath); } LOG.info("Tool name: " + PreprocessTrecForeign.class.getCanonicalName()); LOG.info(" - Collection path: " + collection); LOG.info(" - Index path: " + indexRootPath); LOG.info(" - Language: " + language); LOG.info(" - Stop-word removal?: " + stopwordsFile); LOG.info(" - Tokenizer class: " + tokenizerClass); LOG.info(" - Tokenizer path: " + tokenizerPath); // Create the index directory if it doesn't already exist. Path p = new Path(indexRootPath); if (!fs.exists(p)) { LOG.info("index directory doesn't exist, creating..."); fs.mkdirs(p); } } catch (IOException exp) { LOG.info("Error creating index directory: " + exp.getMessage()); exp.printStackTrace(); } catch (ParseException exp) { LOG.info("Error parsing command line: " + exp.getMessage()); throw new RuntimeException(); } return conf; }