List of usage examples for org.apache.hadoop.fs Path makeQualified
@Deprecated
public Path makeQualified(FileSystem fs)
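Note: this overload is deprecated. In current Hadoop releases the equivalent non-deprecated calls are FileSystem#makeQualified(Path) and Path#makeQualified(URI, Path). A minimal sketch showing all three forms side by side (the path below is illustrative):

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class MakeQualifiedSketch {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Path p = new Path("/user/me/data.txt"); // illustrative path

        FileSystem fs = p.getFileSystem(conf);
        Path q1 = p.makeQualified(fs);                                    // deprecated form used in the examples below
        Path q2 = fs.makeQualified(p);                                    // non-deprecated equivalent
        Path q3 = p.makeQualified(fs.getUri(), fs.getWorkingDirectory()); // non-deprecated equivalent

        // All three yield the same fully qualified path, e.g. file:/user/me/data.txt
        System.out.println(q1 + " " + q2 + " " + q3);
    }
}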
From source file:it.crs4.seal.read_sort.ReadSort.java
License:Open Source License
public static Path getAnnotationPath(Configuration conf) throws IOException {
    String annotationName = conf.get(ReadSort.REF_ANN_PROP_NAME);
    if (annotationName == null)
        throw new RuntimeException("missing property " + REF_ANN_PROP_NAME);

    LOG.info("reading reference annotation from " + annotationName);

    Path annPath = new Path(annotationName);
    FileSystem srcFs;
    if (conf.get("mapred.cache.archives") != null) {
        // we're using the distributed cache for the reference,
        // so it's on the local file system
        srcFs = FileSystem.getLocal(conf);
    } else
        srcFs = annPath.getFileSystem(conf);

    return annPath.makeQualified(srcFs);
}
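The branch above matters because makeQualified resolves against whichever FileSystem it is handed: a distributed-cache entry lives on the task's local disk, while a directly referenced annotation lives on the path's own file system. A minimal illustration of the difference (paths are made up):

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class QualifyAgainstFsSketch {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Path ann = new Path("reference/hg19.ann"); // hypothetical annotation name

        // Distributed-cache case: qualified against the local file system,
        // e.g. file:/current/working/dir/reference/hg19.ann
        System.out.println(ann.makeQualified(FileSystem.getLocal(conf)));

        // Direct-access case: qualified against the path's own file system,
        // e.g. hdfs://namenode:8020/user/me/reference/hg19.ann when HDFS is the default
        System.out.println(ann.makeQualified(ann.getFileSystem(conf)));
    }
}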
From source file:it.crs4.seal.read_sort.ReadSortOptionParser.java
License:Open Source License
@Override
protected CommandLine parseOptions(Configuration conf, String[] args) throws IOException, ParseException {
    CommandLine line = super.parseOptions(conf, args);

    /********* distributed reference and annotations *********/
    if (line.hasOption(distReference.getOpt())) {
        // Distribute the reference archive, and create a
        // symlink "reference" to the directory
        Path optPath = new Path(line.getOptionValue(distReference.getOpt()));
        optPath = optPath.makeQualified(optPath.getFileSystem(conf));
        Path cachePath = new Path(optPath.toString() + "#reference");
        conf.set("mapred.cache.archives", cachePath.toString());
        conf.set("mapred.create.symlink", "yes");

        if (line.hasOption(ann.getOpt()))
            conf.set(ReadSort.REF_ANN_PROP_NAME, "reference/" + line.getOptionValue(ann.getOpt()));
        else
            throw new ParseException(
                    "You must specify the name of the annotation file within the distributed reference archive with -"
                            + ann.getOpt());
    } else if (line.hasOption(ann.getOpt())) {
        // direct access to the reference annotation
        conf.set(ReadSort.REF_ANN_PROP_NAME, line.getOptionValue(ann.getOpt()));
    } else
        throw new ParseException("You must provide the path to the reference annotation file (<ref>.ann)");

    conf.set(ClusterUtils.NUM_RED_TASKS_PROPERTY, String.valueOf(getNReduceTasks()));

    return line;
}
From source file:it.crs4.seal.tsv_sort.TsvSort.java
License:Apache License
public int run(String[] args) throws Exception {
    LOG.info("starting");

    TsvSortOptionParser parser = new TsvSortOptionParser();
    parser.parse(getConf(), args);

    LOG.info("Using " + parser.getNReduceTasks() + " reduce tasks");

    Job job = new Job(getConf());
    job.setJobName("TsvSort " + parser.getInputPaths().get(0));
    job.setJarByClass(TsvSort.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);
    job.setInputFormatClass(TsvInputFormat.class);
    job.setOutputFormatClass(TextValueOutputFormat.class);
    job.setPartitionerClass(TotalOrderPartitioner.class);

    // output path
    FileOutputFormat.setOutputPath(job, parser.getOutputPath());
    FileSystem fs = parser.getOutputPath().getFileSystem(job.getConfiguration());

    /*
     * Pick a random name for the partition file in the same directory as the
     * output path. So, TsvSort /user/me/input /user/me/output
     * results in the partition file being placed in /user/me/_partition.lst.12340921387402174
     *
     * Why not place it directly in the input path?
     *
     * We wouldn't be able to run two sorts on the same data at the same time.
     * We've received complaints about this in the past, so it has been a
     * limit in practice.
     *
     * Why not place it directly in the output path?
     *
     * We'd have to create the output path before the output format did.
     * For this to work we'd have to disable the FileOutputFormat's default check
     * that verifies that the output directory doesn't exist. This means that we'd
     * need some other way to ensure that we're not writing to the same path where
     * some other job wrote.
     */
    Path partitionFile;
    Random rnd = new Random();
    do {
        partitionFile = new Path(parser.getOutputPath().getParent(),
                String.format("_partition.lst.%012d", Math.abs(rnd.nextLong())));
    } while (fs.exists(partitionFile)); // this is still subject to a race condition between it and another instance of this program
    partitionFile = partitionFile.makeQualified(fs);

    LOG.info("partition file path: " + partitionFile);

    URI partitionUri = new URI(partitionFile.toString() + "#" + PARTITION_SYMLINK);
    LOG.debug("partitionUri for distributed cache: " + partitionUri);

    // input paths
    for (Path p : parser.getInputPaths())
        TsvInputFormat.addInputPath(job, p);

    LOG.info("sampling input");
    TextSampler.writePartitionFile(new TsvInputFormat(), job, partitionFile);
    LOG.info("created partitions");

    try {
        DistributedCache.addCacheFile(partitionUri, job.getConfiguration());
        DistributedCache.createSymlink(job.getConfiguration());

        int retcode = job.waitForCompletion(true) ? 0 : 1;
        LOG.info("done");
        return retcode;
    } finally {
        LOG.debug("deleting partition file " + partitionFile);
        fs.delete(partitionFile, false);
    }
}
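Note that the partition file is qualified before the cache URI is built, so the URI handed to the distributed cache carries an explicit scheme and authority in front of the #symlink fragment. A stripped-down sketch of that sequence (path and symlink name are illustrative):

import java.net.URI;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.filecache.DistributedCache;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class PartitionCacheSketch {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Path partitionFile = new Path("/user/me/_partition.lst.000000000042"); // illustrative
        FileSystem fs = partitionFile.getFileSystem(conf);
        partitionFile = partitionFile.makeQualified(fs); // e.g. hdfs://nn:8020/user/me/_partition.lst...

        URI partitionUri = new URI(partitionFile.toString() + "#_partition.lst"); // fragment = symlink name
        DistributedCache.addCacheFile(partitionUri, conf);
        DistributedCache.createSymlink(conf); // tasks then see the file as ./_partition.lst
    }
}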
From source file:ml.shifu.guagua.hadoop.io.GuaguaOptionsParser.java
License:Apache License
/**
 * Takes input as a comma-separated list of files and verifies that they exist. Paths without a scheme
 * default to file:///, and the returned string contains the URI-converted paths. So an input of
 * /home/user/file1,/home/user/file2 would return file:///home/user/file1,file:///home/user/file2
 */
@SuppressWarnings("deprecation")
private String validateFiles(String files, Configuration conf) throws IOException {
    if (files == null)
        return null;

    String[] fileArr = files.split(FILE_SEPERATOR);
    String[] finalArr = new String[fileArr.length];
    for (int i = 0; i < fileArr.length; i++) {
        String tmp = fileArr[i];
        String finalPath;
        URI pathURI;
        try {
            pathURI = new URI(tmp);
        } catch (URISyntaxException e) {
            throw new IllegalArgumentException(e);
        }
        Path path = new Path(pathURI);
        FileSystem localFs = FileSystem.getLocal(conf);
        if (pathURI.getScheme() == null) {
            // default to the local file system
            // check if the file exists or not first
            if (!localFs.exists(path)) {
                throw new FileNotFoundException("File " + tmp + " does not exist.");
            }
            finalPath = path.makeQualified(localFs).toString();
        } else {
            // check if the file exists in this file system
            // we need to recreate this filesystem object to copy
            // these files to the file system the jobtracker is running on.
            FileSystem fs = path.getFileSystem(conf);
            if (!fs.exists(path)) {
                throw new FileNotFoundException("File " + tmp + " does not exist.");
            }
            finalPath = path.makeQualified(fs).toString();
        }
        finalArr[i] = finalPath;
    }
    return StringUtils.arrayToString(finalArr);
}
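A quick illustration of the qualification this method performs on scheme-less paths (hypothetical path; assumes the file exists locally):

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class QualifyLocalSketch {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        FileSystem localFs = FileSystem.getLocal(conf);
        Path p = new Path("/home/user/file1"); // no scheme, so it defaults to the local file system
        System.out.println(p.makeQualified(localFs)); // prints file:/home/user/file1
    }
}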
From source file:ml.shifu.guagua.mapreduce.GuaguaOptionsParser.java
License:Apache License
/**
 * Takes input as a comma-separated list of files and verifies that they exist. Paths without a scheme
 * default to file:///, and the returned string contains the URI-converted paths. So an input of
 * /home/user/file1,/home/user/file2 would return file:///home/user/file1,file:///home/user/file2
 */
private String validateFiles(String files, Configuration conf) throws IOException {
    if (files == null)
        return null;

    String[] fileArr = files.split(FILE_SEPERATOR);
    String[] finalArr = new String[fileArr.length];
    for (int i = 0; i < fileArr.length; i++) {
        String tmp = fileArr[i];
        String finalPath;
        URI pathURI;
        try {
            pathURI = new URI(tmp);
        } catch (URISyntaxException e) {
            throw new IllegalArgumentException(e);
        }
        Path path = new Path(pathURI.toString());
        FileSystem localFs = FileSystem.getLocal(conf);
        if (pathURI.getScheme() == null) {
            // default to the local file system
            // check if the file exists or not first
            if (!localFs.exists(path)) {
                throw new FileNotFoundException("File " + tmp + " does not exist.");
            }
            finalPath = path.makeQualified(localFs).toString();
        } else {
            // check if the file exists in this file system
            // we need to recreate this filesystem object to copy
            // these files to the file system the jobtracker is running on.
            FileSystem fs = path.getFileSystem(conf);
            if (!fs.exists(path)) {
                throw new FileNotFoundException("File " + tmp + " does not exist.");
            }
            finalPath = path.makeQualified(fs).toString();
        }
        finalArr[i] = finalPath;
    }
    return StringUtils.arrayToString(finalArr);
}
From source file:ml.shifu.guagua.yarn.GuaguaOptionsParser.java
License:Apache License
/**
 * Takes input as a comma-separated list of files and verifies that they exist. Paths without a scheme
 * default to file:///, and the returned string contains the URI-converted paths. So an input of
 * /home/user/file1,/home/user/file2 would return file:///home/user/file1,file:///home/user/file2
 */
private String validateFiles(String files, Configuration conf) throws IOException {
    if (files == null)
        return null;

    String[] fileArr = files.split(FILE_SEPERATOR);
    String[] finalArr = new String[fileArr.length];
    for (int i = 0; i < fileArr.length; i++) {
        String tmp = fileArr[i];
        String finalPath;
        URI pathURI;
        try {
            pathURI = new URI(tmp);
        } catch (URISyntaxException e) {
            throw new IllegalArgumentException(e);
        }
        Path path = new Path(pathURI);
        FileSystem localFs = FileSystem.getLocal(conf);
        if (pathURI.getScheme() == null) {
            // default to the local file system
            // check if the file exists or not first
            if (!localFs.exists(path)) {
                throw new FileNotFoundException("File " + tmp + " does not exist.");
            }
            finalPath = path.makeQualified(localFs).toString();
        } else {
            // check if the file exists in this file system
            // we need to recreate this filesystem object to copy
            // these files to the file system the jobtracker is running on.
            FileSystem fs = path.getFileSystem(conf);
            if (!fs.exists(path)) {
                throw new FileNotFoundException("File " + tmp + " does not exist.");
            }
            finalPath = path.makeQualified(fs).toString();
        }
        finalArr[i] = finalPath;
    }
    return StringUtils.arrayToString(finalArr);
}
From source file:my.mahout.AbstractJob.java
License:Apache License
/**
 * necessary to make this job (having a combined input path) work on Amazon S3, hopefully this is
 * obsolete when MultipleInputs is available again
 */
public static void setS3SafeCombinedInputPath(Job job, Path referencePath, Path inputPathOne, Path inputPathTwo)
        throws IOException {
    FileSystem fs = FileSystem.get(referencePath.toUri(), job.getConfiguration());
    FileInputFormat.setInputPaths(job, inputPathOne.makeQualified(fs), inputPathTwo.makeQualified(fs));
}
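A hypothetical call site for the helper, assuming both inputs live on the same S3 bucket as the reference path (bucket and paths are invented):

import my.mahout.AbstractJob;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.Job;

public class CombinedInputSketch {
    public static void main(String[] args) throws Exception {
        Job job = new Job(new Configuration()); // matches the old-style API used above
        Path reference = new Path("s3n://my-bucket/data");
        AbstractJob.setS3SafeCombinedInputPath(job, reference,
                new Path("s3n://my-bucket/data/inputA"),
                new Path("s3n://my-bucket/data/inputB"));
    }
}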
From source file:nl.gridline.zieook.inx.movielens.AggregateAndRecommendReducer.java
License:Apache License
@Override
protected void setup(Context context) throws IOException {
    Configuration jobConf = context.getConfiguration();
    recommendationsPerUser = jobConf.getInt(NUM_RECOMMENDATIONS, DEFAULT_NUM_RECOMMENDATIONS);
    booleanData = jobConf.getBoolean(RecommenderJob.BOOLEAN_DATA, false);
    indexItemIDMap = TasteHadoopUtils.readItemIDIndexMap(jobConf.get(ITEMID_INDEX_PATH), jobConf);

    FSDataInputStream in = null;
    try {
        String itemFilePathString = jobConf.get(ITEMS_FILE);
        if (itemFilePathString == null) {
            itemsToRecommendFor = null;
        } else {
            Path unqualifiedItemsFilePath = new Path(itemFilePathString);
            FileSystem fs = FileSystem.get(unqualifiedItemsFilePath.toUri(), jobConf);
            itemsToRecommendFor = new FastIDSet();
            Path itemsFilePath = unqualifiedItemsFilePath.makeQualified(fs);
            in = fs.open(itemsFilePath);
            for (String line : new FileLineIterable(in)) {
                itemsToRecommendFor.add(Long.parseLong(line));
            }
        }
    } finally {
        IOUtils.closeStream(in);
    }
}
From source file:nl.gridline.zieook.inx.movielens.UserVectorSplitterMapper.java
License:Apache License
@Override
protected void setup(Context context) throws IOException {
    Configuration jobConf = context.getConfiguration();
    maxPrefsPerUserConsidered = jobConf.getInt(MAX_PREFS_PER_USER_CONSIDERED,
            DEFAULT_MAX_PREFS_PER_USER_CONSIDERED);

    String usersFilePathString = jobConf.get(USERS_FILE);
    if (usersFilePathString != null) {
        FSDataInputStream in = null;
        try {
            Path unqualifiedUsersFilePath = new Path(usersFilePathString);
            FileSystem fs = FileSystem.get(unqualifiedUsersFilePath.toUri(), jobConf);
            usersToRecommendFor = new FastIDSet();
            Path usersFilePath = unqualifiedUsersFilePath.makeQualified(fs);
            in = fs.open(usersFilePath);
            for (String line : new FileLineIterable(in)) {
                usersToRecommendFor.add(Long.parseLong(line));
            }
        } finally {
            IOUtils.closeStream(in);
        }
    }
}
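Both setup() methods above follow the same qualify-then-open pattern: resolve a FileSystem from the path's URI, qualify the path against it, then open it on that same file system. A stripped-down sketch (file name is illustrative and assumed to exist):

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IOUtils;

public class QualifyThenOpenSketch {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Path unqualified = new Path("users.txt"); // may be relative; makeQualified resolves it
        FileSystem fs = FileSystem.get(unqualified.toUri(), conf);
        Path qualified = unqualified.makeQualified(fs); // absolute, with scheme and authority
        try (FSDataInputStream in = fs.open(qualified)) {
            IOUtils.copyBytes(in, System.out, 4096, false); // dump the file's contents
        }
    }
}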
From source file:nl.gridline.zieook.runners.cf.RecommenderJobZieOok.java
License:Apache License
@Override
public int run(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
    addInputOption();
    addOutputOption();
    addOption("numRecommendations", "n", "Number of recommendations per user",
            String.valueOf(AggregateAndRecommendReducer.DEFAULT_NUM_RECOMMENDATIONS));
    addOption("usersFile", "u", "File of users to recommend for", null);
    addOption("itemsFile", "i", "File of items to recommend for", null);
    addOption("filterFile", "f",
            "File containing comma-separated userID,itemID pairs. Used to exclude the item from "
                    + "the recommendations for that user (optional)",
            null);
    addOption("booleanData", "b", "Treat input as without pref values", Boolean.FALSE.toString());
    addOption("maxPrefsPerUser", "mp",
            "Maximum number of preferences considered per user in final recommendation phase",
            String.valueOf(UserVectorSplitterMapper.DEFAULT_MAX_PREFS_PER_USER_CONSIDERED));
    addOption("minPrefsPerUser", "mp",
            "ignore users with less preferences than this in the similarity computation "
                    + "(default: " + DEFAULT_MIN_PREFS_PER_USER + ')',
            String.valueOf(DEFAULT_MIN_PREFS_PER_USER));
    addOption("maxSimilaritiesPerItem", "m", "Maximum number of similarities considered per item ",
            String.valueOf(DEFAULT_MAX_SIMILARITIES_PER_ITEM));
    addOption("maxCooccurrencesPerItem", "mo",
            "try to cap the number of cooccurrences per item to this "
                    + "number (default: " + DEFAULT_MAX_COOCCURRENCES_PER_ITEM + ')',
            String.valueOf(DEFAULT_MAX_COOCCURRENCES_PER_ITEM));
    addOption("similarityClassname", "s",
            "Name of distributed similarity class to instantiate, alternatively use "
                    + "one of the predefined similarities (" + SimilarityType.listEnumNames() + ')',
            String.valueOf(SimilarityType.SIMILARITY_COOCCURRENCE));

    Map<String, String> parsedArgs = parseArguments(args);
    if (parsedArgs == null) {
        return -1;
    }

    Path inputPath = getInputPath();
    Path outputPath = getOutputPath();
    Path tempDirPath = new Path(parsedArgs.get("--tempDir"));
    int numRecommendations = Integer.parseInt(parsedArgs.get("--numRecommendations"));
    String usersFile = parsedArgs.get("--usersFile");
    String itemsFile = parsedArgs.get("--itemsFile");
    String filterFile = parsedArgs.get("--filterFile");
    boolean booleanData = Boolean.valueOf(parsedArgs.get("--booleanData"));
    int maxPrefsPerUser = Integer.parseInt(parsedArgs.get("--maxPrefsPerUser"));
    int minPrefsPerUser = Integer.parseInt(parsedArgs.get("--minPrefsPerUser"));
    int maxSimilaritiesPerItem = Integer.parseInt(parsedArgs.get("--maxSimilaritiesPerItem"));
    int maxCooccurrencesPerItem = Integer.parseInt(parsedArgs.get("--maxCooccurrencesPerItem"));
    String similarityClassname = parsedArgs.get("--similarityClassname");

    Path userVectorPath = new Path(tempDirPath, "userVectors");
    Path itemIDIndexPath = new Path(tempDirPath, "itemIDIndex");
    Path countUsersPath = new Path(tempDirPath, "countUsers");
    Path itemUserMatrixPath = new Path(tempDirPath, "itemUserMatrix");
    Path similarityMatrixPath = new Path(tempDirPath, "similarityMatrix");
    Path prePartialMultiplyPath1 = new Path(tempDirPath, "prePartialMultiply1");
    Path prePartialMultiplyPath2 = new Path(tempDirPath, "prePartialMultiply2");
    Path explicitFilterPath = new Path(tempDirPath, "explicitFilterPath");
    Path partialMultiplyPath = new Path(tempDirPath, "partialMultiply");

    AtomicInteger currentPhase = new AtomicInteger();

    if (shouldRunNextPhase(parsedArgs, currentPhase)) {
        Job itemIDIndex = prepareJob(inputPath, itemIDIndexPath, TextInputFormat.class, ItemIDIndexMapper.class,
                VarIntWritable.class, VarLongWritable.class, ItemIDIndexReducer.class, VarIntWritable.class,
                VarLongWritable.class, SequenceFileOutputFormat.class);
        itemIDIndex.setCombinerClass(ItemIDIndexReducer.class);
        task.setCurrentJob(itemIDIndex).waitForCompletion(true);
    }

    if (shouldRunNextPhase(parsedArgs, currentPhase)) {
        Job toUserVector = prepareJob(inputPath, userVectorPath, TextInputFormat.class, ToItemPrefsMapper.class,
                VarLongWritable.class, booleanData ? VarLongWritable.class : EntityPrefWritable.class,
                ToUserVectorReducer.class, VarLongWritable.class, VectorWritable.class,
                SequenceFileOutputFormat.class);
        toUserVector.getConfiguration().setBoolean(BOOLEAN_DATA, booleanData);
        toUserVector.getConfiguration().setInt(ToUserVectorReducer.MIN_PREFERENCES_PER_USER, minPrefsPerUser);
        task.setCurrentJob(toUserVector).waitForCompletion(true);
    }

    if (shouldRunNextPhase(parsedArgs, currentPhase)) {
        Job countUsers = prepareJob(userVectorPath, countUsersPath, SequenceFileInputFormat.class,
                CountUsersMapper.class, CountUsersKeyWritable.class, VarLongWritable.class,
                CountUsersReducer.class, VarIntWritable.class, NullWritable.class, TextOutputFormat.class);
        countUsers.setPartitionerClass(CountUsersKeyWritable.CountUsersPartitioner.class);
        countUsers.setGroupingComparatorClass(CountUsersKeyWritable.CountUsersGroupComparator.class);
        task.setCurrentJob(countUsers).waitForCompletion(true);
    }

    if (shouldRunNextPhase(parsedArgs, currentPhase)) {
        Job maybePruneAndTransponse = prepareJob(userVectorPath, itemUserMatrixPath,
                SequenceFileInputFormat.class, MaybePruneRowsMapper.class, IntWritable.class,
                DistributedRowMatrix.MatrixEntryWritable.class, ToItemVectorsReducer.class, IntWritable.class,
                VectorWritable.class, SequenceFileOutputFormat.class);
        maybePruneAndTransponse.getConfiguration().setInt(MaybePruneRowsMapper.MAX_COOCCURRENCES,
                maxCooccurrencesPerItem);
        task.setCurrentJob(maybePruneAndTransponse).waitForCompletion(true);
    }

    int numberOfUsers = TasteHadoopUtils.readIntFromFile(getConf(), countUsersPath);

    if (shouldRunNextPhase(parsedArgs, currentPhase)) {
        /*
         * Once DistributedRowMatrix uses the hadoop 0.20 API, we should refactor this call to something like
         * new DistributedRowMatrix(...).rowSimilarity(...)
         */
        try {
            ToolRunner.run(getConf(), new RowSimilarityZieOok(), new String[] { //
                    "--input", itemUserMatrixPath.toString(), //
                    "--output", similarityMatrixPath.toString(), //
                    "--numberOfColumns", String.valueOf(numberOfUsers), //
                    "--similarityClassname", similarityClassname, //
                    "--maxSimilaritiesPerRow", String.valueOf(maxSimilaritiesPerItem + 1), //
                    "--tempDir", tempDirPath.toString() });
        } catch (Exception e) {
            throw new IllegalStateException("item-item-similarity computation failed", e);
        }
    }

    if (shouldRunNextPhase(parsedArgs, currentPhase)) {
        Job prePartialMultiply1 = prepareJob(similarityMatrixPath, prePartialMultiplyPath1,
                SequenceFileInputFormat.class, SimilarityMatrixRowWrapperMapper.class, VarIntWritable.class,
                VectorOrPrefWritable.class, Reducer.class, VarIntWritable.class, VectorOrPrefWritable.class,
                SequenceFileOutputFormat.class);
        task.setCurrentJob(prePartialMultiply1).waitForCompletion(true);

        Job prePartialMultiply2 = prepareJob(userVectorPath, prePartialMultiplyPath2,
                SequenceFileInputFormat.class, UserVectorSplitterMapper.class, VarIntWritable.class,
                VectorOrPrefWritable.class, Reducer.class, VarIntWritable.class, VectorOrPrefWritable.class,
                SequenceFileOutputFormat.class);
        if (usersFile != null) {
            prePartialMultiply2.getConfiguration().set(UserVectorSplitterMapper.USERS_FILE, usersFile);
        }
        prePartialMultiply2.getConfiguration().setInt(UserVectorSplitterMapper.MAX_PREFS_PER_USER_CONSIDERED,
                maxPrefsPerUser);
        task.setCurrentJob(prePartialMultiply2).waitForCompletion(true);

        Job partialMultiply = prepareJob(new Path(prePartialMultiplyPath1 + "," + prePartialMultiplyPath2),
                partialMultiplyPath, SequenceFileInputFormat.class, Mapper.class, VarIntWritable.class,
                VectorOrPrefWritable.class, ToVectorAndPrefReducer.class, VarIntWritable.class,
                VectorAndPrefsWritable.class, SequenceFileOutputFormat.class);

        /* necessary to make this job (having a combined input path) work on Amazon S3 */
        Configuration partialMultiplyConf = partialMultiply.getConfiguration();
        FileSystem fs = FileSystem.get(tempDirPath.toUri(), partialMultiplyConf);
        prePartialMultiplyPath1 = prePartialMultiplyPath1.makeQualified(fs);
        prePartialMultiplyPath2 = prePartialMultiplyPath2.makeQualified(fs);
        FileInputFormat.setInputPaths(partialMultiply, prePartialMultiplyPath1, prePartialMultiplyPath2);
        task.setCurrentJob(partialMultiply).waitForCompletion(true);
    }

    if (shouldRunNextPhase(parsedArgs, currentPhase)) {
        /* convert the user/item pairs to filter if a filterfile has been specified */
        if (filterFile != null) {
            Job itemFiltering = prepareJob(new Path(filterFile), explicitFilterPath, TextInputFormat.class,
                    ItemFilterMapper.class, VarLongWritable.class, VarLongWritable.class,
                    ItemFilterAsVectorAndPrefsReducer.class, VarIntWritable.class, VectorAndPrefsWritable.class,
                    SequenceFileOutputFormat.class);
            task.setCurrentJob(itemFiltering).waitForCompletion(true);
        }

        String aggregateAndRecommendInput = partialMultiplyPath.toString();
        if (filterFile != null) {
            aggregateAndRecommendInput += "," + explicitFilterPath;
        }

        Job aggregateAndRecommend = prepareJob(new Path(aggregateAndRecommendInput), outputPath,
                SequenceFileInputFormat.class, PartialMultiplyMapper.class, VarLongWritable.class,
                PrefAndSimilarityColumnWritable.class, AggregateAndRecommendReducer.class,
                VarLongWritable.class, RecommendedItemsWritable.class, SequenceFileOutputFormat.class);
        Configuration aggregateAndRecommendConf = aggregateAndRecommend.getConfiguration();
        if (itemsFile != null) {
            aggregateAndRecommendConf.set(AggregateAndRecommendReducer.ITEMS_FILE, itemsFile);
        }

        if (filterFile != null) {
            /* necessary to make this job (having a combined input path) work on Amazon S3 */
            FileSystem fs = FileSystem.get(tempDirPath.toUri(), aggregateAndRecommendConf);
            partialMultiplyPath = partialMultiplyPath.makeQualified(fs);
            explicitFilterPath = explicitFilterPath.makeQualified(fs);
            FileInputFormat.setInputPaths(aggregateAndRecommend, partialMultiplyPath, explicitFilterPath);
        }
        setIOSort(aggregateAndRecommend);
        aggregateAndRecommendConf.set(AggregateAndRecommendReducer.ITEMID_INDEX_PATH,
                itemIDIndexPath.toString());
        aggregateAndRecommendConf.setInt(AggregateAndRecommendReducer.NUM_RECOMMENDATIONS, numRecommendations);
        aggregateAndRecommendConf.setBoolean(BOOLEAN_DATA, booleanData);
        task.setCurrentJob(aggregateAndRecommend).waitForCompletion(true);
    }
    return 0;
}