List of usage examples for org.apache.mahout.common.iterator.FileLineIterator
Constructor: public FileLineIterator(InputStream is) throws IOException
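Before the collected examples, a minimal sketch of direct usage, assuming FileLineIterator behaves as an Iterator<String> over the lines of the stream and implements Closeable; the input file name below is hypothetical. Note that several of the examples that follow use the File-based overload rather than the InputStream constructor shown above.

import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;

import org.apache.mahout.common.iterator.FileLineIterator;

public class FileLineIteratorSketch {
    public static void main(String[] args) throws IOException {
        // "data.txt" is a placeholder; any line-oriented text stream works.
        InputStream is = new FileInputStream("data.txt");
        FileLineIterator lines = new FileLineIterator(is);
        try {
            // Each call to next() yields one line of the stream as a String.
            while (lines.hasNext()) {
                String line = lines.next();
                System.out.println(line);
            }
        } finally {
            lines.close(); // assumed to also release the underlying stream
        }
    }
}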
From source file:com.anjuke.romar.mahout.similarity.file.RomarFileSimilarityIterator.java
License:Apache License
public static IteratorBuiler<ItemItemSimilarity> lineFileItemIteratorBuilder() {
    return new IteratorBuiler<ItemItemSimilarity>() {
        @Override
        public Iterator<ItemItemSimilarity> build(File file) {
            try {
                return new RomarFileSimilarityIterator<ItemItemSimilarity>(new FileLineIterator(file),
                        new ItemSimilarityBuilder());
            } catch (IOException e) {
                throw new IllegalStateException("Can't read " + file, e);
            }
        }
    };
}
From source file:com.anjuke.romar.mahout.similarity.file.RomarFileSimilarityIterator.java
License:Apache License
public static IteratorBuiler<UserUserSimilarity> lineFileUserIteratorBuilder() {
    return new IteratorBuiler<UserUserSimilarity>() {
        @Override
        public Iterator<UserUserSimilarity> build(File file) {
            try {
                return new RomarFileSimilarityIterator<UserUserSimilarity>(new FileLineIterator(file),
                        new UserSimilarityBuilder());
            } catch (IOException e) {
                throw new IllegalStateException("Can't read " + file, e);
            }
        }
    };
}
From source file:de.tuberlin.dima.recsys.ssnmm.RatingsIterable.java
License:Apache License
public RatingsIterable(File ratings) throws IOException {
    Preconditions.checkNotNull(ratings);
    this.rating = new Rating();
    this.lines = new FileLineIterator(ratings);
}
From source file:org.gpfvic.mahout.cf.taste.impl.similarity.file.FileItemItemSimilarityIterator.java
License:Apache License
FileItemItemSimilarityIterator(File similaritiesFile) throws IOException {
    delegate = Iterators.transform(new FileLineIterator(similaritiesFile),
            new Function<String, GenericItemSimilarity.ItemItemSimilarity>() {
                @Override
                public GenericItemSimilarity.ItemItemSimilarity apply(String from) {
                    String[] tokens = SEPARATOR.split(from);
                    return new GenericItemSimilarity.ItemItemSimilarity(Long.parseLong(tokens[0]),
                            Long.parseLong(tokens[1]), Double.parseDouble(tokens[2]));
                }
            });
}
From source file:org.hmahout.example.NetflixDatasetConverter.java
License:Apache License
public static void main(String[] args) throws IOException {
    if (args.length != 4) {
        System.err.println("Usage: NetflixDatasetConverter /path/to/training_set/ /path/to/qualifying.txt "
                + "/path/to/judging.txt /path/to/destination");
        return;
    }

    String trainingDataDir = args[0];
    String qualifyingTxt = args[1];
    String judgingTxt = args[2];
    Path outputPath = new Path(args[3]);

    Configuration conf = new Configuration();
    FileSystem fs = FileSystem.get(outputPath.toUri(), conf);

    log.info("Creating training set at {}/trainingSet/ratings.tsv ...", outputPath);
    BufferedWriter writer = null;
    try {
        FSDataOutputStream outputStream = fs.create(new Path(outputPath, "trainingSet/ratings.tsv"));
        writer = new BufferedWriter(new OutputStreamWriter(outputStream, Charsets.UTF_8));

        int ratingsProcessed = 0;
        for (File movieRatings : new File(trainingDataDir).listFiles()) {
            FileLineIterator lines = null;
            try {
                lines = new FileLineIterator(movieRatings);
                boolean firstLineRead = false;
                String movieID = null;
                while (lines.hasNext()) {
                    String line = lines.next();
                    if (firstLineRead) {
                        String[] tokens = SEPARATOR.split(line);
                        String userID = tokens[0];
                        String rating = tokens[1];
                        writer.write(userID + TAB + movieID + TAB + rating + NEWLINE);
                        ratingsProcessed++;
                        if (ratingsProcessed % 1000000 == 0) {
                            log.info("{} ratings processed...", ratingsProcessed);
                        }
                    } else {
                        movieID = line.replaceAll(MOVIE_DENOTER, "");
                        firstLineRead = true;
                    }
                }
            } finally {
                Closeables.close(lines, true);
            }
        }
        log.info("{} ratings processed. done.", ratingsProcessed);
    } finally {
        Closeables.close(writer, false);
    }

    log.info("Reading probes...");
    List<Preference> probes = Lists.newArrayListWithExpectedSize(2817131);
    long currentMovieID = -1;
    for (String line : new FileLineIterable(new File(qualifyingTxt))) {
        if (line.contains(MOVIE_DENOTER)) {
            currentMovieID = Long.parseLong(line.replaceAll(MOVIE_DENOTER, ""));
        } else {
            long userID = Long.parseLong(SEPARATOR.split(line)[0]);
            probes.add(new GenericPreference(userID, currentMovieID, 0));
        }
    }
    log.info("{} probes read...", probes.size());

    log.info("Reading ratings, creating probe set at {}/probeSet/ratings.tsv ...", outputPath);
    writer = null;
    try {
        FSDataOutputStream outputStream = fs.create(new Path(outputPath, "probeSet/ratings.tsv"));
        writer = new BufferedWriter(new OutputStreamWriter(outputStream, Charsets.UTF_8));
        int ratingsProcessed = 0;
        for (String line : new FileLineIterable(new File(judgingTxt))) {
            if (line.contains(MOVIE_DENOTER)) {
                currentMovieID = Long.parseLong(line.replaceAll(MOVIE_DENOTER, ""));
            } else {
                float rating = Float.parseFloat(SEPARATOR.split(line)[0]);
                Preference pref = probes.get(ratingsProcessed);
                Preconditions.checkState(pref.getItemID() == currentMovieID);
                ratingsProcessed++;
                writer.write(pref.getUserID() + TAB + pref.getItemID() + TAB + rating + NEWLINE);
                if (ratingsProcessed % 1000000 == 0) {
                    log.info("{} ratings processed...", ratingsProcessed);
                }
            }
        }
        log.info("{} ratings processed. done.", ratingsProcessed);
    } finally {
        Closeables.close(writer, false);
    }
}
From source file:org.sleuthkit.hadoop.clustering.ClusterUtil.java
License:Apache License
/**
 * Read in a dictionary file. Format is: First line is the number of entries
 * <pre>
 * term DocFreq Index
 * </pre>
 */
static String[] loadTermDictionary(InputStream is) throws IOException {
    FileLineIterator it = new FileLineIterator(is);

    int numEntries = Integer.parseInt(it.next());
    String[] result = new String[numEntries];

    while (it.hasNext()) {
        String line = it.next();
        if (line.startsWith("#")) {
            continue;
        }
        String[] tokens = ClusterUtil.TAB_PATTERN.split(line);
        if (tokens.length < 3) {
            continue;
        }
        int index = Integer.parseInt(tokens[2]); // tokens[1] is the doc freq
        result[index] = tokens[0];
    }
    return result;
}