List of usage examples for org.apache.hadoop.fs FileSystem globStatus
public FileStatus[] globStatus(Path pathPattern) throws IOException
Returns all the files that match pathPattern and are not checksum files.
From source file:org.apache.lens.server.util.ScannedPaths.java
License:Apache License
/**
 * Resolves the resources denoted by {@code pt}, which may be a plain file, a directory,
 * or a glob pattern, and returns them as a list of path strings.
 *
 * <ul>
 *   <li>Existing file: its URI string is the single match.</li>
 *   <li>Existing directory: if it contains a {@code jar_order} file (checked first) or a
 *       {@code glob_order} file, each non-blank line of that file is resolved recursively, in
 *       file order. Entries starting with {@code /} or containing {@code :/} are used as-is;
 *       all others are resolved relative to the directory. Without an order file every child
 *       of the directory is resolved recursively.</li>
 *   <li>Non-existing path: treated as a glob pattern (scheme and authority stripped) and every
 *       match is resolved recursively.</li>
 * </ul>
 *
 * The collected list is handed to {@code filterDirsAndJarType} before returning. Any exception
 * is logged and whatever has been collected up to that point is returned unfiltered.
 *
 * @param pt   path, directory or glob pattern to resolve
 * @param type resource type, forwarded to recursive calls and included in log messages
 * @return the matched paths; possibly empty, never {@code null}
 */
private List<String> getMatchedPaths(Path pt, String type) {
    final List<String> matched = new ArrayList<>();
    InputStream orderStream = null;
    try {
        final FileSystem fs = pt.getFileSystem(new Configuration());
        if (!fs.exists(pt)) {
            // CASE 3: the path does not exist literally, so interpret it as a glob pattern.
            FileStatus[] globbed = fs.globStatus(Path.getPathWithoutSchemeAndAuthority(pt));
            if (globbed != null) {
                for (FileStatus status : globbed) {
                    matched.addAll(getMatchedPaths(status.getPath(), type));
                }
            }
        } else if (fs.isFile(pt)) {
            // CASE 1: a single file was given directly.
            matched.add(pt.toUri().toString());
        } else if (fs.isDirectory(pt)) {
            // CASE 2: a directory; look for an ordering file, jar_order taking precedence.
            Path orderFile = new Path(pt, "jar_order");
            if (!fs.exists(orderFile)) {
                orderFile = new Path(pt, "glob_order");
                if (!fs.exists(orderFile)) {
                    orderFile = null;
                }
            }

            if (orderFile == null) {
                // No ordering file: take every child of the directory.
                FileStatus[] children = fs.globStatus(new Path(pt, "*"));
                if (children != null) {
                    for (FileStatus child : children) {
                        matched.addAll(getMatchedPaths(child.getPath(), type));
                    }
                }
            } else {
                // Ordering file present: resolve only the listed entries, in listed order.
                orderStream = fs.open(orderFile);
                List<String> entries = IOUtils.readLines(orderStream, Charset.forName("UTF-8"));
                for (String entry : entries) {
                    if (StringUtils.isBlank(entry)) {
                        continue;
                    }
                    String resource = entry.trim();
                    boolean usedAsIs = resource.startsWith("/") || resource.contains(":/");
                    Path target = usedAsIs ? new Path(resource) : new Path(pt, resource);
                    matched.addAll(getMatchedPaths(target, type));
                }
            }
        }
        filterDirsAndJarType(fs, matched);
    } catch (FileNotFoundException fex) {
        log.error("File not found while scanning path. Path: {}, Type: {}", path, type, fex);
    } catch (Exception e) {
        log.error("Exception while initializing PathScanner. Path: {}, Type: {}", path, type, e);
    } finally {
        IOUtils.closeQuietly(orderStream);
    }
    return matched;
}
From source file:org.apache.mahout.cf.taste.hadoop.als.eval.InMemoryFactorizationEvaluator.java
License:Apache License
private Matrix readMatrix(Path dir) throws IOException { Matrix matrix = new SparseMatrix(new int[] { Integer.MAX_VALUE, Integer.MAX_VALUE }); FileSystem fs = dir.getFileSystem(getConf()); for (FileStatus seqFile : fs.globStatus(new Path(dir, "part-*"))) { Path path = seqFile.getPath(); SequenceFile.Reader reader = null; try {//w w w.j av a2s . c o m reader = new SequenceFile.Reader(fs, path, getConf()); IntWritable key = new IntWritable(); VectorWritable value = new VectorWritable(); while (reader.next(key, value)) { int row = key.get(); Iterator<Vector.Element> elementsIterator = value.get().iterateNonZero(); while (elementsIterator.hasNext()) { Vector.Element element = elementsIterator.next(); matrix.set(row, element.index(), element.get()); } } } finally { Closeables.closeQuietly(reader); } } return matrix; }
From source file:org.apache.mahout.cf.taste.hadoop.als.eval.InMemoryFactorizationEvaluator.java
License:Apache License
/**
 * Reads preferences from all {@code part-*} text files in {@code dir}.
 *
 * Every line is split with {@code TasteHadoopUtils.splitPrefTokens}; the first three tokens are
 * parsed as user ID (long), item ID (long) and preference value (float).
 *
 * @param dir directory containing the {@code part-*} files
 * @return the preferences in the order they were read
 * @throws IOException if a file cannot be opened or read
 */
private List<Preference> readProbePreferences(Path dir) throws IOException {
    FileSystem fs = dir.getFileSystem(getConf());
    List<Preference> result = new LinkedList<Preference>();

    FileStatus[] parts = fs.globStatus(new Path(dir, "part-*"));
    for (FileStatus part : parts) {
        InputStream stream = null;
        try {
            stream = fs.open(part.getPath());
            BufferedReader lines = new BufferedReader(new InputStreamReader(stream, Charset.forName("UTF-8")));
            for (String line = lines.readLine(); line != null; line = lines.readLine()) {
                String[] fields = TasteHadoopUtils.splitPrefTokens(line);
                long userID = Long.parseLong(fields[0]);
                long itemID = Long.parseLong(fields[1]);
                float rating = Float.parseFloat(fields[2]);
                result.add(new GenericPreference(userID, itemID, rating));
            }
        } finally {
            // Best-effort close; a failure to close is ignored.
            Closeables.closeQuietly(stream);
        }
    }
    return result;
}
From source file:org.apache.mahout.classifier.bayes.io.SequenceFileModelReader.java
License:Apache License
public static void loadWeightMatrix(InMemoryBayesDatastore datastore, FileSystem fs, Path pathPattern, Configuration conf) throws IOException { StringTuple key = new StringTuple(); DoubleWritable value = new DoubleWritable(); FileStatus[] outputFiles = fs.globStatus(pathPattern); for (FileStatus fileStatus : outputFiles) { Path path = fileStatus.getPath(); log.info("{}", path); SequenceFile.Reader reader = new SequenceFile.Reader(fs, path, conf); // the key is label,feature while (reader.next(key, value)) { datastore.loadFeatureWeight(key.stringAt(2), key.stringAt(1), value.get()); }/*from w ww . j av a 2 s. c o m*/ } }
From source file:org.apache.mahout.classifier.bayes.io.SequenceFileModelReader.java
License:Apache License
public static void loadFeatureWeights(InMemoryBayesDatastore datastore, FileSystem fs, Path pathPattern, Configuration conf) throws IOException { StringTuple key = new StringTuple(); DoubleWritable value = new DoubleWritable(); FileStatus[] outputFiles = fs.globStatus(pathPattern); for (FileStatus fileStatus : outputFiles) { Path path = fileStatus.getPath(); log.info("{}", path); SequenceFile.Reader reader = new SequenceFile.Reader(fs, path, conf); // the key is either _label_ or label,feature long count = 0; while (reader.next(key, value)) { // Sum of weights for a Feature if (key.stringAt(0).equals(BayesConstants.FEATURE_SUM)) { datastore.setSumFeatureWeight(key.stringAt(1), value.get()); count++;// ww w . j a va 2 s.c o m if (count % 50000 == 0) { log.info("Read {} feature weights", count); } } } } }
From source file:org.apache.mahout.classifier.bayes.io.SequenceFileModelReader.java
License:Apache License
public static void loadLabelWeights(InMemoryBayesDatastore datastore, FileSystem fs, Path pathPattern, Configuration conf) throws IOException { StringTuple key = new StringTuple(); DoubleWritable value = new DoubleWritable(); FileStatus[] outputFiles = fs.globStatus(pathPattern); for (FileStatus fileStatus : outputFiles) { Path path = fileStatus.getPath(); log.info("{}", path); SequenceFile.Reader reader = new SequenceFile.Reader(fs, path, conf); long count = 0; while (reader.next(key, value)) { // Sum of weights in a Label if (key.stringAt(0).equals(BayesConstants.LABEL_SUM)) { datastore.setSumLabelWeight(key.stringAt(1), value.get()); count++;/*w ww .ja va 2s.c o m*/ if (count % 10000 == 0) { log.info("Read {} label weights", count); } } } } }
From source file:org.apache.mahout.classifier.bayes.io.SequenceFileModelReader.java
License:Apache License
public static void loadThetaNormalizer(InMemoryBayesDatastore datastore, FileSystem fs, Path pathPattern, Configuration conf) throws IOException { StringTuple key = new StringTuple(); DoubleWritable value = new DoubleWritable(); FileStatus[] outputFiles = fs.globStatus(pathPattern); for (FileStatus fileStatus : outputFiles) { Path path = fileStatus.getPath(); log.info("{}", path); SequenceFile.Reader reader = new SequenceFile.Reader(fs, path, conf); long count = 0; while (reader.next(key, value)) { // Sum of weights in a Label if (key.stringAt(0).equals(BayesConstants.LABEL_THETA_NORMALIZER)) { datastore.setThetaNormalizer(key.stringAt(1), value.get()); count++;/* w ww . java2 s .co m*/ if (count % 50000 == 0) { log.info("Read {} theta norms", count); } } } } }
From source file:org.apache.mahout.classifier.bayes.io.SequenceFileModelReader.java
License:Apache License
public static void loadSumWeight(InMemoryBayesDatastore datastore, FileSystem fs, Path pathPattern, Configuration conf) throws IOException { StringTuple key = new StringTuple(); DoubleWritable value = new DoubleWritable(); FileStatus[] outputFiles = fs.globStatus(pathPattern); for (FileStatus fileStatus : outputFiles) { Path path = fileStatus.getPath(); log.info("{}", path); SequenceFile.Reader reader = new SequenceFile.Reader(fs, path, conf); // the key is _label while (reader.next(key, value)) { if (key.stringAt(0).equals(BayesConstants.TOTAL_SUM)) { // Sum of // weights for // all Features and all Labels datastore.setSigmaJSigmaK(value.get()); log.info("{}", value.get()); }/*from w ww .java 2 s .com*/ } } }
From source file:org.apache.mahout.classifier.bayes.io.SequenceFileModelReader.java
License:Apache License
public static Map<String, Double> readLabelSums(FileSystem fs, Path pathPattern, Configuration conf) throws IOException { Map<String, Double> labelSum = new HashMap<String, Double>(); StringTuple key = new StringTuple(); DoubleWritable value = new DoubleWritable(); FileStatus[] outputFiles = fs.globStatus(pathPattern); for (FileStatus fileStatus : outputFiles) { Path path = fileStatus.getPath(); SequenceFile.Reader reader = new SequenceFile.Reader(fs, path, conf); // the key is either _label_ or label,feature while (reader.next(key, value)) { if (key.stringAt(0).equals(BayesConstants.LABEL_SUM)) { // Sum of counts // of labels labelSum.put(key.stringAt(1), value.get()); }//from w ww . java 2s . c o m } } return labelSum; }
From source file:org.apache.mahout.classifier.bayes.io.SequenceFileModelReader.java
License:Apache License
public static Map<String, Double> readLabelDocumentCounts(FileSystem fs, Path pathPattern, Configuration conf) throws IOException { Map<String, Double> labelDocumentCounts = new HashMap<String, Double>(); StringTuple key = new StringTuple(); DoubleWritable value = new DoubleWritable(); FileStatus[] outputFiles = fs.globStatus(pathPattern); for (FileStatus fileStatus : outputFiles) { Path path = fileStatus.getPath(); SequenceFile.Reader reader = new SequenceFile.Reader(fs, path, conf); // the key is either _label_ or label,feature while (reader.next(key, value)) { // Count of Documents in a Label if (key.stringAt(0).equals(BayesConstants.LABEL_COUNT)) { labelDocumentCounts.put(key.stringAt(1), value.get()); }//from w w w . j av a 2 s. c o m } } return labelDocumentCounts; }