Example usage for org.apache.mahout.common.iterator FileLineIterator FileLineIterator

List of usage examples for org.apache.mahout.common.iterator FileLineIterator FileLineIterator

Introduction

In this page you can find the example usage for org.apache.mahout.common.iterator FileLineIterator FileLineIterator.

Prototype

public FileLineIterator(InputStream is) throws IOException 

Source Link

Usage

From source file:com.anjuke.romar.mahout.similarity.file.RomarFileSimilarityIterator.java

License:Apache License

/**
 * Returns a builder whose {@code build(File)} opens the given file and
 * yields its lines as {@code ItemItemSimilarity} entries.
 */
public static IteratorBuiler<ItemItemSimilarity> lineFileItemIteratorBuilder() {
    return new IteratorBuiler<ItemItemSimilarity>() {
        @Override
        public Iterator<ItemItemSimilarity> build(File file) {
            try {
                // Open the file line-by-line and let the similarity builder parse each line.
                FileLineIterator lines = new FileLineIterator(file);
                return new RomarFileSimilarityIterator<ItemItemSimilarity>(lines, new ItemSimilarityBuilder());
            } catch (IOException ioe) {
                throw new IllegalStateException("Can't read " + file, ioe);
            }
        }
    };
}

From source file:com.anjuke.romar.mahout.similarity.file.RomarFileSimilarityIterator.java

License:Apache License

/**
 * Returns a builder whose {@code build(File)} opens the given file and
 * yields its lines as {@code UserUserSimilarity} entries.
 */
public static IteratorBuiler<UserUserSimilarity> lineFileUserIteratorBuilder() {
    return new IteratorBuiler<UserUserSimilarity>() {
        @Override
        public Iterator<UserUserSimilarity> build(File file) {
            try {
                // Open the file line-by-line and let the similarity builder parse each line.
                FileLineIterator lines = new FileLineIterator(file);
                return new RomarFileSimilarityIterator<UserUserSimilarity>(lines, new UserSimilarityBuilder());
            } catch (IOException ioe) {
                throw new IllegalStateException("Can't read " + file, ioe);
            }
        }
    };
}

From source file:de.tuberlin.dima.recsys.ssnmm.RatingsIterable.java

License:Apache License

/**
 * Creates an iterable over the ratings stored in the given file.
 *
 * @param ratings file containing the ratings; must not be null
 * @throws IOException if the file cannot be opened for reading
 */
public RatingsIterable(File ratings) throws IOException {
    Preconditions.checkNotNull(ratings);
    // Single reusable Rating instance; the line iterator supplies the raw data.
    this.lines = new FileLineIterator(ratings);
    this.rating = new Rating();
}

From source file:org.gpfvic.mahout.cf.taste.impl.similarity.file.FileItemItemSimilarityIterator.java

License:Apache License

/**
 * Iterates over a similarities file, parsing each line into a
 * {@code GenericItemSimilarity.ItemItemSimilarity}. Every line is expected
 * to contain three SEPARATOR-delimited fields: itemID1, itemID2, value.
 *
 * @param similaritiesFile file to read
 * @throws IOException if the file cannot be opened
 */
FileItemItemSimilarityIterator(File similaritiesFile) throws IOException {
    FileLineIterator lines = new FileLineIterator(similaritiesFile);
    Function<String, GenericItemSimilarity.ItemItemSimilarity> parser =
            new Function<String, GenericItemSimilarity.ItemItemSimilarity>() {
                @Override
                public GenericItemSimilarity.ItemItemSimilarity apply(String line) {
                    String[] parts = SEPARATOR.split(line);
                    long itemID1 = Long.parseLong(parts[0]);
                    long itemID2 = Long.parseLong(parts[1]);
                    double value = Double.parseDouble(parts[2]);
                    return new GenericItemSimilarity.ItemItemSimilarity(itemID1, itemID2, value);
                }
            };
    delegate = Iterators.transform(lines, parser);
}

From source file:org.hmahout.example.NetflixDatasetConverter.java

License:Apache License

/**
 * Converts the Netflix Prize dataset into TSV files on a Hadoop filesystem.
 * Writes {@code trainingSet/ratings.tsv} from the per-movie training files,
 * reads the qualifying probes, then writes {@code probeSet/ratings.tsv} by
 * joining the judging ratings against the probes in order.
 *
 * @param args trainingSetDir, qualifyingTxt, judgingTxt, outputPath
 * @throws IOException if any file cannot be read or written
 */
public static void main(String[] args) throws IOException {

    if (args.length != 4) {
        System.err.println("Usage: NetflixDatasetConverter /path/to/training_set/ /path/to/qualifying.txt "
                + "/path/to/judging.txt /path/to/destination");
        return;
    }

    String trainingDataDir = args[0];
    String qualifyingTxt = args[1];
    String judgingTxt = args[2];
    Path outputPath = new Path(args[3]);

    Configuration conf = new Configuration();
    FileSystem fs = FileSystem.get(outputPath.toUri(), conf);

    log.info("Creating training set at {}/trainingSet/ratings.tsv ...", outputPath);
    BufferedWriter writer = null;
    try {
        FSDataOutputStream outputStream = fs.create(new Path(outputPath, "trainingSet/ratings.tsv"));
        writer = new BufferedWriter(new OutputStreamWriter(outputStream, Charsets.UTF_8));

        // listFiles() returns null when the directory does not exist or is
        // unreadable; fail with a clear message instead of an NPE.
        File[] movieFiles = new File(trainingDataDir).listFiles();
        if (movieFiles == null) {
            throw new IOException("Unable to list training files in " + trainingDataDir);
        }

        int ratingsProcessed = 0;
        for (File movieRatings : movieFiles) {
            FileLineIterator lines = null;
            try {
                lines = new FileLineIterator(movieRatings);
                // First line of each file carries the movie ID; the rest are "user,rating" pairs.
                boolean firstLineRead = false;
                String movieID = null;
                while (lines.hasNext()) {
                    String line = lines.next();
                    if (firstLineRead) {
                        String[] tokens = SEPARATOR.split(line);
                        String userID = tokens[0];
                        String rating = tokens[1];
                        writer.write(userID + TAB + movieID + TAB + rating + NEWLINE);
                        ratingsProcessed++;
                        if (ratingsProcessed % 1000000 == 0) {
                            log.info("{} ratings processed...", ratingsProcessed);
                        }
                    } else {
                        movieID = line.replaceAll(MOVIE_DENOTER, "");
                        firstLineRead = true;
                    }
                }
            } finally {
                Closeables.close(lines, true);
            }
        }
        log.info("{} ratings processed. done.", ratingsProcessed);
    } finally {
        Closeables.close(writer, false);
    }

    log.info("Reading probes...");
    // 2817131 is the known size of the Netflix qualifying set; presize to avoid growth.
    List<Preference> probes = Lists.newArrayListWithExpectedSize(2817131);
    long currentMovieID = -1;
    for (String line : new FileLineIterable(new File(qualifyingTxt))) {
        if (line.contains(MOVIE_DENOTER)) {
            currentMovieID = Long.parseLong(line.replaceAll(MOVIE_DENOTER, ""));
        } else {
            long userID = Long.parseLong(SEPARATOR.split(line)[0]);
            probes.add(new GenericPreference(userID, currentMovieID, 0));
        }
    }
    log.info("{} probes read...", probes.size());

    log.info("Reading ratings, creating probe set at {}/probeSet/ratings.tsv ...", outputPath);
    writer = null;
    try {
        FSDataOutputStream outputStream = fs.create(new Path(outputPath, "probeSet/ratings.tsv"));
        writer = new BufferedWriter(new OutputStreamWriter(outputStream, Charsets.UTF_8));

        int ratingsProcessed = 0;
        for (String line : new FileLineIterable(new File(judgingTxt))) {
            if (line.contains(MOVIE_DENOTER)) {
                currentMovieID = Long.parseLong(line.replaceAll(MOVIE_DENOTER, ""));
            } else {
                float rating = Float.parseFloat(SEPARATOR.split(line)[0]);
                // judging.txt must be ordered identically to qualifying.txt;
                // assert the join stays aligned movie-by-movie.
                Preference pref = probes.get(ratingsProcessed);
                Preconditions.checkState(pref.getItemID() == currentMovieID);
                ratingsProcessed++;
                writer.write(pref.getUserID() + TAB + pref.getItemID() + TAB + rating + NEWLINE);
                if (ratingsProcessed % 1000000 == 0) {
                    log.info("{} ratings processed...", ratingsProcessed);
                }
            }
        }
        log.info("{} ratings processed. done.", ratingsProcessed);
    } finally {
        Closeables.close(writer, false);
    }
}

From source file:org.sleuthkit.hadoop.clustering.ClusterUtil.java

License:Apache License

/**
 * Reads a term dictionary from the given stream. Expected format: the first
 * line holds the number of entries; each following line is
 * {@code term<TAB>docFreq<TAB>index}. Lines starting with {@code #} or with
 * fewer than three fields are skipped.
 *
 * @param is stream to read; fully consumed and closed by this method
 * @return array mapping index to term (unreferenced indices stay null)
 * @throws IOException if the stream cannot be read
 */
static String[] loadTermDictionary(InputStream is) throws IOException {
    FileLineIterator it = new FileLineIterator(is);
    try {
        int numEntries = Integer.parseInt(it.next());
        String[] result = new String[numEntries];

        while (it.hasNext()) {
            String line = it.next();
            if (line.startsWith("#")) {
                continue; // comment line
            }
            String[] tokens = ClusterUtil.TAB_PATTERN.split(line);
            if (tokens.length < 3) {
                continue; // malformed line — ignore, preserving prior lenient behavior
            }
            int index = Integer.parseInt(tokens[2]); // tokens[1] is the doc freq, unused
            result[index] = tokens[0];
        }
        return result;
    } finally {
        // Previously leaked: FileLineIterator holds the reader over the stream.
        it.close();
    }
}