List of usage examples for org.apache.hadoop.fs.FileSystem.isFile
@Deprecated public boolean isFile(Path f) throws IOException
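isFile returns true only when the path exists and refers to a regular file; the method is deprecated in favor of inspecting the path's FileStatus. The short sketch below is not taken from the listed projects: the path /tmp/example.txt is a hypothetical placeholder, and it simply shows the deprecated call next to its replacement. Note that isFile returns false for a missing path, while getFileStatus throws FileNotFoundException, so callers relying on the old behavior need an explicit exists check.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class IsFileExample {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Path path = new Path("/tmp/example.txt"); // hypothetical path, for illustration only
        FileSystem fs = path.getFileSystem(conf);

        // Deprecated style, as used in the examples below: true only for an existing regular file,
        // false for directories and for paths that do not exist.
        boolean isFileDeprecated = fs.isFile(path);

        // Replacement style: inspect the FileStatus instead.
        // getFileStatus throws FileNotFoundException if the path is missing.
        FileStatus status = fs.getFileStatus(path);
        boolean isFile = status.isFile();

        System.out.println(isFileDeprecated + " / " + isFile);
    }
}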
From source file:org.apache.mahout.classifier.rbm.test.TestRBMClassifierJob.java
License:Apache License
@Override
public int run(String[] args) throws Exception {
    addInputOption();
    addOption("model", "m", "The path to the model built during training", true);
    addOption("labelcount", "lc", "total count of labels existent in the training set", true);
    addOption(DefaultOptionCreator.MAX_ITERATIONS_OPTION, "max",
            "least number of stable iterations in classification layer when classifying", "10");
    addOption(new DefaultOptionBuilder().withLongName(DefaultOptionCreator.MAPREDUCE_METHOD).withRequired(false)
            .withDescription("Run tests with map/reduce").withShortName("mr").create());

    Map<String, String> parsedArgs = parseArguments(args);
    if (parsedArgs == null) {
        return -1;
    }

    int labelcount = Integer.parseInt(getOption("labelcount"));
    iterations = Integer.parseInt(getOption("maxIter"));

    // check the model's existence
    Path model = new Path(parsedArgs.get("--model"));
    if (!model.getFileSystem(getConf()).exists(model)) {
        log.error("Model file does not exist!");
        return -1;
    }

    // create the list of all labels
    List<String> lables = new ArrayList<String>();
    for (int i = 0; i < labelcount; i++)
        lables.add(String.valueOf(i));

    FileSystem fs = getInputPath().getFileSystem(getConf());
    ResultAnalyzer analyzer = new ResultAnalyzer(lables, "-1");

    // initiate the paths to the test batches
    Path[] batches;
    if (fs.isFile(getInputPath()))
        batches = new Path[] { getInputPath() };
    else {
        FileStatus[] stati = fs.listStatus(getInputPath());
        batches = new Path[stati.length];
        for (int i = 0; i < stati.length; i++) {
            batches[i] = stati[i].getPath();
        }
    }

    if (hasOption("mapreduce"))
        HadoopUtil.delete(getConf(), getTempPath("testresults"));

    for (Path input : batches) {
        if (hasOption("mapreduce")) {
            HadoopUtil.cacheFiles(model, getConf());
            // the output key is the expected value, the output value are the scores for all the labels
            Job testJob = prepareJob(input, getTempPath("testresults"), SequenceFileInputFormat.class,
                    TestRBMClassifierMapper.class, IntWritable.class, VectorWritable.class,
                    SequenceFileOutputFormat.class);
            testJob.getConfiguration().set("maxIter", String.valueOf(iterations));
            testJob.waitForCompletion(true);

            // loop over the results and create the confusion matrix
            SequenceFileDirIterable<IntWritable, VectorWritable> dirIterable =
                    new SequenceFileDirIterable<IntWritable, VectorWritable>(
                            getTempPath("testresults"), PathType.LIST, PathFilters.partFilter(), getConf());

            analyzeResults(dirIterable, analyzer);
        } else {
            // test job locally
            runTestsLocally(model, analyzer, input);
        }
    }

    // output the result of the tests
    log.info("RBMClassifier Results: {}", analyzer);

    // stop all running threads
    if (executor != null)
        executor.shutdownNow();

    return 0;
}
From source file:org.apache.mahout.classifier.rbm.training.RBMClassifierTrainingJob.java
License:Apache License
@Override
public int run(String[] args) throws Exception {
    addInputOption();
    addOutputOption();
    addOption("epochs", "e", "number of training epochs through the trainingset", true);
    addOption("structure", "s", "comma-separated list of layer sizes", false);
    addOption("labelcount", "lc", "total count of labels existent in the training set", true);
    addOption("learningrate", "lr", "learning rate at the beginning of training", "0.005");
    addOption("momentum", "m", "momentum of learning at the beginning", "0.5");
    addOption("rbmnr", "nr", "rbm to train, < 0 means train all", "-1");
    addOption("nrgibbs", "gn", "number of gibbs sampling used in contrastive divergence", "5");
    addOption(new DefaultOptionBuilder().withLongName(DefaultOptionCreator.MAPREDUCE_METHOD).withRequired(false)
            .withDescription("Run training with map/reduce").withShortName("mr").create());
    addOption(new DefaultOptionBuilder().withLongName("nogreedy").withRequired(false)
            .withDescription("Don't run greedy pre training").withShortName("ng").create());
    addOption(new DefaultOptionBuilder().withLongName("nofinetuning").withRequired(false)
            .withDescription("Don't run fine tuning at the end").withShortName("nf").create());
    addOption(new DefaultOptionBuilder().withLongName("nobiases").withRequired(false)
            .withDescription("Don't initialize biases").withShortName("nb").create());
    addOption(new DefaultOptionBuilder().withLongName("monitor").withRequired(false)
            .withDescription("If present, errors can be monitored in cosole").withShortName("mon").create());
    addOption(DefaultOptionCreator.overwriteOption().create());

    Map<String, String> parsedArgs = parseArguments(args);
    if (parsedArgs == null) {
        return -1;
    }

    Path input = getInputPath();
    Path output = getOutputPath();
    FileSystem fs = FileSystem.get(output.toUri(), getConf());
    labelcount = Integer.parseInt(getOption("labelcount"));

    boolean local = !hasOption("mapreduce");
    monitor = hasOption("monitor");
    initbiases = !hasOption("nobiases");
    finetuning = !hasOption("nofinetuning");
    greedy = !hasOption("nogreedy");

    if (fs.isFile(input))
        batches = new Path[] { input };
    else {
        FileStatus[] stati = fs.listStatus(input);
        batches = new Path[stati.length];
        for (int i = 0; i < stati.length; i++) {
            batches[i] = stati[i].getPath();
        }
    }

    epochs = Integer.valueOf(getOption("epochs"));
    learningrate = Double.parseDouble(getOption("learningrate"));
    momentum = Double.parseDouble(getOption("momentum"));
    rbmNrtoTrain = Integer.parseInt(getOption("rbmnr"));
    nrGibbsSampling = Integer.parseInt(getOption("nrgibbs"));

    boolean initialize = hasOption(DefaultOptionCreator.OVERWRITE_OPTION) || !fs.exists(output)
            || fs.listStatus(output).length <= 0;

    if (initialize) {
        String structure = getOption("structure");
        if (structure == null || structure.isEmpty())
            return -1;

        String[] layers = structure.split(",");
        if (layers.length < 2) {
            return -1;
        }

        int[] actualLayerSizes = new int[layers.length];
        for (int i = 0; i < layers.length; i++) {
            actualLayerSizes[i] = Integer.parseInt(layers[i]);
        }

        rbmCl = new RBMClassifier(labelcount, actualLayerSizes);
        logger.info("New model initialized!");
    } else {
        rbmCl = RBMClassifier.materialize(output, getConf());
        logger.info("Model found and materialized!");
    }

    HadoopUtil.setSerializations(getConf());
    lastUpdate = new Matrix[rbmCl.getDbm().getRbmCount()];

    if (initbiases) {
        // init biases!
        Vector biases = null;
        int counter = 0;
        for (Path batch : batches) {
            for (Pair<IntWritable, VectorWritable> record : new SequenceFileIterable<IntWritable, VectorWritable>(
                    batch, getConf())) {
                if (biases == null)
                    biases = record.getSecond().get().clone();
                else
                    biases.plus(record.getSecond().get());
                counter++;
            }
        }
        if (biases == null) {
            logger.info("No training data found!");
            return -1;
        }
        rbmCl.getDbm().getLayer(0).setBiases(biases.divide(counter));
        logger.info("Biases initialized");
    }

    // greedy pre training with gradually decreasing learningrates
    if (greedy) {
        if (!local)
            rbmCl.serialize(output, getConf());

        double tempLearningrate = learningrate;
        if (rbmNrtoTrain < 0)
            // train all rbms
            for (int rbmNr = 0; rbmNr < rbmCl.getDbm().getRbmCount(); rbmNr++) {
                tempLearningrate = learningrate;

                // double weights if dbm was materialized, because it was halved after greedy pretraining
                if (!initialize && rbmNrtoTrain > 0 && rbmNrtoTrain < rbmCl.getDbm().getRbmCount() - 1) {
                    ((SimpleRBM) rbmCl.getDbm().getRBM(rbmNr)).setWeightMatrix(
                            ((SimpleRBM) rbmCl.getDbm().getRBM(rbmNr)).getWeightMatrix().times(2));
                }

                for (int j = 0; j < epochs; j++) {
                    logger.info("Greedy training, epoch " + (j + 1) + "\nCurrent learningrate: "
                            + tempLearningrate);
                    for (int b = 0; b < batches.length; b++) {
                        tempLearningrate -= learningrate / (epochs * batches.length + epochs);
                        if (local) {
                            if (!trainGreedySeq(rbmNr, batches[b], j, tempLearningrate))
                                return -1;
                        } else if (!trainGreedyMR(rbmNr, batches[b], j, tempLearningrate))
                            return -1;
                        if (monitor && (batches.length > 19) && (b + 1) % (batches.length / 20) == 0)
                            logger.info(rbmNr + "-RBM: "
                                    + Math.round(((double) b + 1) / batches.length * 100.0)
                                    + "% in epoch done!");
                    }
                    logger.info(Math.round(((double) j + 1) / epochs * 100) + "% of training on rbm number "
                            + rbmNr + " is done!");

                    if (monitor) {
                        double error = rbmError(batches[0], rbmNr);
                        logger.info("Average reconstruction error on batch " + batches[0].getName() + ": "
                                + error);
                    }

                    rbmCl.serialize(output, getConf());
                }

                // weight normalization to avoid double counting
                if (rbmNr > 0 && rbmNr < rbmCl.getDbm().getRbmCount() - 1) {
                    ((SimpleRBM) rbmCl.getDbm().getRBM(rbmNrtoTrain)).setWeightMatrix(
                            ((SimpleRBM) rbmCl.getDbm().getRBM(rbmNrtoTrain)).getWeightMatrix().times(0.5));
                }
            }
        else {
            // double weights if dbm was materialized, because it was halved after greedy pretraining
            if (!initialize && rbmNrtoTrain > 0 && rbmNrtoTrain < rbmCl.getDbm().getRbmCount() - 1) {
                ((SimpleRBM) rbmCl.getDbm().getRBM(rbmNrtoTrain)).setWeightMatrix(
                        ((SimpleRBM) rbmCl.getDbm().getRBM(rbmNrtoTrain)).getWeightMatrix().times(2));
            }

            // train just wanted rbm
            for (int j = 0; j < epochs; j++) {
                logger.info("Greedy training, epoch " + (j + 1) + "\nCurrent learningrate: " + tempLearningrate);
                for (int b = 0; b < batches.length; b++) {
                    tempLearningrate -= learningrate / (epochs * batches.length + epochs);
                    if (local) {
                        if (!trainGreedySeq(rbmNrtoTrain, batches[b], j, tempLearningrate))
                            return -1;
                    } else if (!trainGreedyMR(rbmNrtoTrain, batches[b], j, tempLearningrate))
                        return -1;
                    if (monitor && (batches.length > 19) && (b + 1) % (batches.length / 20) == 0)
                        logger.info(rbmNrtoTrain + "-RBM: "
                                + Math.round(((double) b + 1) / batches.length * 100.0) + "% in epoch done!");
                }
                logger.info(Math.round(((double) j + 1) / epochs * 100) + "% of training is done!");

                if (monitor) {
                    double error = rbmError(batches[0], rbmNrtoTrain);
                    logger.info("Average reconstruction error on batch " + batches[0].getName() + ": " + error);
                }
            }

            // weight normalization to avoid double counting
            if (rbmNrtoTrain > 0 && rbmNrtoTrain < rbmCl.getDbm().getRbmCount() - 1) {
                ((SimpleRBM) rbmCl.getDbm().getRBM(rbmNrtoTrain)).setWeightMatrix(
                        ((SimpleRBM) rbmCl.getDbm().getRBM(rbmNrtoTrain)).getWeightMatrix().times(0.5));
            }
        }

        rbmCl.serialize(output, getConf());
        logger.info("Pretraining done and model written to output");
    }

    if (finetuning) {
        DeepBoltzmannMachine multiLayerDbm = null;
        double tempLearningrate = learningrate;

        // finetuning job
        for (int j = 0; j < epochs; j++) {
            for (int b = 0; b < batches.length; b++) {
                multiLayerDbm = rbmCl.initializeMultiLayerNN();
                logger.info("Finetuning on batch " + batches[b].getName() + "\nCurrent learningrate: "
                        + tempLearningrate);
                tempLearningrate -= learningrate / (epochs * batches.length + epochs);
                if (local) {
                    if (!finetuneSeq(batches[b], j, multiLayerDbm, tempLearningrate))
                        return -1;
                } else if (!fintuneMR(batches[b], j, tempLearningrate))
                    return -1;
                logger.info("Finetuning: " + Math.round(((double) b + 1) / batches.length * 100.0)
                        + "% in epoch done!");
            }
            logger.info(Math.round(((double) j + 1) / epochs * 100) + "% of training is done!");

            if (monitor) {
                double error = feedForwardError(multiLayerDbm, batches[0]);
                logger.info("Average discriminative error on batch " + batches[0].getName() + ": " + error);
            }
        }

        // final serialization
        rbmCl.serialize(output, getConf());
        logger.info("RBM finetuning done and model written to output");
    }

    if (executor != null)
        executor.shutdownNow();

    return 0;
}
From source file:org.apache.mahout.classifier.svm.datastore.HDFSReader.java
License:Apache License
public List<String> readADirectory(Path filePath) {
    List<String> lines = new ArrayList<String>();
    try {
        FileSystem fs = FileSystem.get(this.conf);
        if (!fs.isFile(filePath)) {
            FileStatus[] fsList = fs.listStatus(filePath);
            for (FileStatus file : fsList) {
                if (!file.isDir()) {
                    lines.addAll(readAllLines(new Path(filePath.toString() + "/" + file.getPath().getName())));
                }
            }
        }
    } catch (Exception e) {
        log.error("Exception: " + e.getMessage());
    }
    return lines;
}
From source file:org.apache.mahout.classifier.svm.datastore.HDFSReader.java
License:Apache License
public boolean isDir(Path filePath) {
    try {
        FileSystem fs = FileSystem.get(this.conf);
        if (fs.isFile(filePath)) {
            return false;
        } else {
            return true;
        }
    } catch (Exception e) {
        log.error("Exception: " + e.getMessage());
        return false;
    }
}
From source file:org.apache.mahout.clustering.lda.cvb.InMemoryCollapsedVariationalBayes0.java
License:Apache License
private static Matrix loadVectors(String vectorPathString, Configuration conf) throws IOException {
    Path vectorPath = new Path(vectorPathString);
    FileSystem fs = vectorPath.getFileSystem(conf);
    List<Path> subPaths = Lists.newArrayList();
    if (fs.isFile(vectorPath)) {
        subPaths.add(vectorPath);
    } else {
        for (FileStatus fileStatus : fs.listStatus(vectorPath, PathFilters.logsCRCFilter())) {
            subPaths.add(fileStatus.getPath());
        }
    }
    List<Pair<Integer, Vector>> rowList = Lists.newArrayList();
    int numRows = Integer.MIN_VALUE;
    int numCols = -1;
    boolean sequentialAccess = false;
    for (Path subPath : subPaths) {
        for (Pair<IntWritable, VectorWritable> record : new SequenceFileIterable<IntWritable, VectorWritable>(
                subPath, true, conf)) {
            int id = record.getFirst().get();
            Vector vector = record.getSecond().get();
            if (vector instanceof NamedVector) {
                vector = ((NamedVector) vector).getDelegate();
            }
            if (numCols < 0) {
                numCols = vector.size();
                sequentialAccess = vector.isSequentialAccess();
            }
            rowList.add(Pair.of(id, vector));
            numRows = Math.max(numRows, id);
        }
    }
    numRows++;
    Vector[] rowVectors = new Vector[numRows];
    for (Pair<Integer, Vector> pair : rowList) {
        rowVectors[pair.getFirst()] = pair.getSecond();
    }
    return new SparseRowMatrix(numRows, numCols, rowVectors, true, !sequentialAccess);
}
From source file:org.apache.mahout.text.WholeFileRecordReader.java
License:Apache License
@Override
public boolean nextKeyValue() throws IOException {
    if (!processed) {
        byte[] contents = new byte[(int) fileSplit.getLength()];
        Path file = fileSplit.getPath();
        FileSystem fs = file.getFileSystem(this.configuration);

        if (!fs.isFile(file)) {
            return false;
        }

        FileStatus[] fileStatuses;
        if (pathFilter != null) {
            fileStatuses = fs.listStatus(file, pathFilter);
        } else {
            fileStatuses = fs.listStatus(file);
        }

        FSDataInputStream in = null;
        if (fileStatuses.length == 1) {
            try {
                in = fs.open(fileStatuses[0].getPath());
                IOUtils.readFully(in, contents, 0, contents.length);
                value.setCapacity(contents.length);
                value.set(contents, 0, contents.length);
            } finally {
                Closeables.close(in, false);
            }
            processed = true;
            return true;
        }
    }
    return false;
}
From source file:org.apache.metamodel.util.HdfsResource.java
License:Apache License
@Override
public long getSize() {
    final FileSystem fs = getHadoopFileSystem();
    try {
        if (fs.isFile(getHadoopPath())) {
            return fs.getFileStatus(getHadoopPath()).getLen();
        } else {
            return fs.getContentSummary(getHadoopPath()).getLength();
        }
    } catch (Exception e) {
        throw wrapException(e);
    } finally {
        FileHelper.safeClose(fs);
    }
}
From source file:org.apache.metamodel.util.HdfsResource.java
License:Apache License
@Override
public InputStream read() throws ResourceException {
    final FileSystem fs = getHadoopFileSystem();
    final InputStream in;
    try {
        final Path hadoopPath = getHadoopPath();
        // return a wrapper InputStream which manages the 'fs' closeable
        if (fs.isFile(hadoopPath)) {
            in = fs.open(hadoopPath);
            return new HdfsFileInputStream(in, fs);
        } else {
            return new HdfsDirectoryInputStream(hadoopPath, fs);
        }
    } catch (Exception e) {
        // we can close 'fs' in case of an exception
        FileHelper.safeClose(fs);
        throw wrapException(e);
    }
}
From source file:org.apache.oozie.action.hadoop.FsActionExecutor.java
License:Apache License
@SuppressWarnings("unchecked")
void doOperations(Context context, Element element) throws ActionExecutorException {
    try {
        FileSystem fs = context.getAppFileSystem();

        boolean recovery = fs.exists(getRecoveryPath(context));
        if (!recovery) {
            fs.mkdirs(getRecoveryPath(context));
        }

        Path nameNodePath = null;
        Element nameNodeElement = element.getChild("name-node", element.getNamespace());
        if (nameNodeElement != null) {
            String nameNode = nameNodeElement.getTextTrim();
            if (nameNode != null) {
                nameNodePath = new Path(nameNode);
                // Verify the name node now
                validatePath(nameNodePath, true);
            }
        }

        XConfiguration fsConf = new XConfiguration();
        Path appPath = new Path(context.getWorkflow().getAppPath());
        // app path could be a file
        if (fs.isFile(appPath)) {
            appPath = appPath.getParent();
        }

        JavaActionExecutor.parseJobXmlAndConfiguration(context, element, appPath, fsConf);

        for (Element commandElement : (List<Element>) element.getChildren()) {
            String command = commandElement.getName();
            if (command.equals("mkdir")) {
                Path path = getPath(commandElement, "path");
                mkdir(context, fsConf, nameNodePath, path);
            } else {
                if (command.equals("delete")) {
                    Path path = getPath(commandElement, "path");
                    boolean skipTrash = true;
                    if (commandElement.getAttributeValue("skip-trash") != null
                            && commandElement.getAttributeValue("skip-trash").equals("false")) {
                        skipTrash = false;
                    }
                    delete(context, fsConf, nameNodePath, path, skipTrash);
                } else {
                    if (command.equals("move")) {
                        Path source = getPath(commandElement, "source");
                        Path target = getPath(commandElement, "target");
                        move(context, fsConf, nameNodePath, source, target, recovery);
                    } else {
                        if (command.equals("chmod")) {
                            Path path = getPath(commandElement, "path");
                            boolean recursive = commandElement.getChild("recursive",
                                    commandElement.getNamespace()) != null;
                            String str = commandElement.getAttributeValue("dir-files");
                            boolean dirFiles = (str == null) || Boolean.parseBoolean(str);
                            String permissionsMask = commandElement.getAttributeValue("permissions").trim();
                            chmod(context, fsConf, nameNodePath, path, permissionsMask, dirFiles, recursive);
                        } else {
                            if (command.equals("touchz")) {
                                Path path = getPath(commandElement, "path");
                                touchz(context, fsConf, nameNodePath, path);
                            } else {
                                if (command.equals("chgrp")) {
                                    Path path = getPath(commandElement, "path");
                                    boolean recursive = commandElement.getChild("recursive",
                                            commandElement.getNamespace()) != null;
                                    String group = commandElement.getAttributeValue("group");
                                    String str = commandElement.getAttributeValue("dir-files");
                                    boolean dirFiles = (str == null) || Boolean.parseBoolean(str);
                                    chgrp(context, fsConf, nameNodePath, path, context.getWorkflow().getUser(),
                                            group, dirFiles, recursive);
                                }
                            }
                        }
                    }
                }
            }
        }
    } catch (Exception ex) {
        throw convertException(ex);
    }
}
From source file:org.apache.oozie.action.hadoop.FsActionExecutor.java
License:Apache License
/**
 * Move source to target
 *
 * @param context
 * @param fsConf
 * @param nameNodePath
 * @param source
 * @param target
 * @param recovery
 * @throws ActionExecutorException
 */
public void move(Context context, XConfiguration fsConf, Path nameNodePath, Path source, Path target,
        boolean recovery) throws ActionExecutorException {
    try {
        source = resolveToFullPath(nameNodePath, source, true);
        validateSameNN(source, target);
        FileSystem fs = getFileSystemFor(source, context, fsConf);

        Path[] pathArr = FileUtil.stat2Paths(fs.globStatus(source));
        if ((pathArr == null || pathArr.length == 0)) {
            if (!recovery) {
                throw new ActionExecutorException(ActionExecutorException.ErrorType.ERROR, "FS006",
                        "move, source path [{0}] does not exist", source);
            } else {
                return;
            }
        }
        if (pathArr.length > 1 && (!fs.exists(target) || fs.isFile(target))) {
            if (!recovery) {
                throw new ActionExecutorException(ActionExecutorException.ErrorType.ERROR, "FS012",
                        "move, could not rename multiple sources to the same target name");
            } else {
                return;
            }
        }
        checkGlobMax(pathArr);
        for (Path p : pathArr) {
            if (!fs.rename(p, target) && !recovery) {
                throw new ActionExecutorException(ActionExecutorException.ErrorType.ERROR, "FS008",
                        "move, could not move [{0}] to [{1}]", p, target);
            }
        }
    } catch (Exception ex) {
        throw convertException(ex);
    }
}