Example usage for org.apache.mahout.common.iterator FileLineIterable FileLineIterable

List of usage examples for org.apache.mahout.common.iterator FileLineIterable FileLineIterable

Introduction

In this page you can find the example usage for org.apache.mahout.common.iterator FileLineIterable FileLineIterable.

Prototype

public FileLineIterable(InputStream is) 

Source Link

Usage

From source file:de.tuberlin.dima.aim.exercises.one.AverageTemperaturePerMonthTest.java

License:Open Source License

/**
 * Parses the job's output file into a map from (year, month) to average temperature.
 *
 * Each line is expected to hold three tab-separated fields: year, month, temperature.
 *
 * @param outputFile tab-separated results file produced by the Hadoop job
 * @return map of year/month keys to their average temperature
 * @throws IOException if the file cannot be read
 */
private Map<YearAndMonth, Double> readResults(File outputFile) throws IOException {
    Pattern fieldSeparator = Pattern.compile("\t");
    Map<YearAndMonth, Double> averageTemperatures = Maps.newHashMap();
    for (String line : new FileLineIterable(outputFile)) {
        String[] fields = fieldSeparator.split(line);
        YearAndMonth key = new YearAndMonth(Integer.parseInt(fields[0]), Integer.parseInt(fields[1]));
        averageTemperatures.put(key, Double.parseDouble(fields[2]));
    }
    return averageTemperatures;
}

From source file:de.tuberlin.dima.aim.exercises.one.FilteringWordCountTest.java

License:Open Source License

/**
 * Reads a tab-separated (word, count) output file into a map.
 *
 * @param outputFile tab-separated results file produced by the Hadoop job
 * @return map from word to its count
 * @throws IOException if the file cannot be read
 */
protected Map<String, Integer> getCounts(File outputFile) throws IOException {
    Map<String, Integer> counts = Maps.newHashMap();
    for (String line : new FileLineIterable(outputFile)) {
        String[] fields = line.split("\t");
        String word = fields[0];
        counts.put(word, Integer.parseInt(fields[1]));
    }
    return counts;
}

From source file:de.tuberlin.dima.aim.exercises.two.BookAndAuthorJoinTest.java

License:Open Source License

/**
 * Parses the join job's output into a multimap from author name to books.
 *
 * Each line holds three tab-separated fields: author, title, year.
 *
 * @param outputFile tab-separated results file produced by the Hadoop job
 * @return multimap keyed by author, containing each of the author's books
 * @throws IOException if the file cannot be read
 */
Multimap<String, Book> readBooksByAuthors(File outputFile) throws IOException {
    Pattern tab = Pattern.compile("\t");
    Multimap<String, Book> booksByAuthors = HashMultimap.create();
    for (String line : new FileLineIterable(outputFile)) {
        String[] fields = tab.split(line);
        Book book = new Book(fields[1], Integer.parseInt(fields[2]));
        booksByAuthors.put(fields[0], book);
    }
    return booksByAuthors;
}

From source file:de.tuberlin.dima.aim.exercises.two.SecondarySortBookSortTest.java

License:Open Source License

/**
 * Reads the Hadoop output file into an array of (century, title) pairs.
 *
 * Each line holds two tab-separated fields: century and book title.
 *
 * @param file tab-separated output file produced by the Hadoop job
 * @return the parsed entries, in file order
 * @throws IOException if the file cannot be read
 */
CenturyAndTitle[] asListFromHadoopOut(File file) throws IOException {
    Pattern tab = Pattern.compile("\t");
    List<CenturyAndTitle> entries = Lists.newArrayList();
    for (String line : new FileLineIterable(file)) {
        String[] fields = tab.split(line);
        entries.add(new CenturyAndTitle(Integer.parseInt(fields[0]), fields[1]));
    }
    return entries.toArray(new CenturyAndTitle[entries.size()]);
}

From source file:de.tuberlin.dima.aim.exercises.two.SecondarySortBookSortTest.java

License:Open Source License

/**
 * Reads the raw input file into an array of (century, title) pairs.
 *
 * Lines are tab-separated; the century is derived from the first two digits of
 * the year in the second field, and the title is the third field.
 *
 * @param file tab-separated input file
 * @return the parsed entries, in file order
 * @throws IOException if the file cannot be read
 */
CenturyAndTitle[] asListFromInput(File file) throws IOException {
    Pattern tab = Pattern.compile("\t");
    List<CenturyAndTitle> entries = Lists.newArrayList();
    for (String line : new FileLineIterable(file)) {
        String[] fields = tab.split(line);
        // First two digits of the year give the century.
        int century = Integer.parseInt(fields[1].substring(0, 2));
        entries.add(new CenturyAndTitle(century, fields[2]));
    }
    return entries.toArray(new CenturyAndTitle[entries.size()]);
}

From source file:de.tuberlin.dima.recsys.ssnmm.ratingprediction.Evaluate.java

License:Apache License

/**
 * Evaluates item-based rating prediction against a held-out set.
 *
 * Loads a precomputed item-item similarity matrix plus per-item and per-user
 * biases, then replays the training ratings user by user. Whenever a new user
 * begins, the 10 held-out ratings of the previous user are predicted from the
 * accumulated preferences and both MAE/RMSE of the similarity-based estimate
 * and of the plain baseline estimate (mu + user bias + item bias) are tracked.
 *
 * @param args unused
 * @throws IOException if any of the input files cannot be read
 */
public static void main(String[] args) throws IOException {

    int numUsers = 1823179;
    int numItems = 136736;
    // Global rating mean of the training set.
    double mu = 3.157255412010664;

    String distributedSimilarityMatrixPath = "/home/ssc/Desktop/yahoo/similarityMatrix/";
    String itemBiasesFilePath = "/home/ssc/Desktop/yahoo/itemBiases.tsv";
    String userBiasesFilePath = "/home/ssc/Desktop/yahoo/userBiases.tsv";
    String trainingSetPath = "/home/ssc/Entwicklung/datasets/yahoo-songs/songs.tsv";
    // Fixed: the original was missing the leading '/', making this a relative path
    // unlike every other path above.
    String holdoutSetPath = "/home/ssc/Entwicklung/datasets/yahoo-songs/holdout.tsv";

    Matrix similarities = new SparseRowMatrix(numItems, numItems);

    System.out.println("Reading similarities...");
    int similaritiesRead = 0;
    Configuration conf = new Configuration();
    for (Pair<IntWritable, VectorWritable> pair : new SequenceFileDirIterable<IntWritable, VectorWritable>(
            new Path(distributedSimilarityMatrixPath), PathType.LIST, PathFilters.partFilter(), conf)) {

        int item = pair.getFirst().get();
        Iterator<Vector.Element> elements = pair.getSecond().get().iterateNonZero();

        while (elements.hasNext()) {
            Vector.Element elem = elements.next();
            similarities.setQuick(item, elem.index(), elem.get());
            similaritiesRead++;
        }
    }
    System.out.println("Found " + similaritiesRead + " similarities");

    Pattern sep = Pattern.compile("\t");

    double[] itemBiases = new double[numItems];
    double[] userBiases = new double[numUsers];

    System.out.println("Reading item biases");
    for (String line : new FileLineIterable(new File(itemBiasesFilePath))) {
        String[] parts = sep.split(line);
        itemBiases[Integer.parseInt(parts[0])] = Double.parseDouble(parts[1]);
    }

    System.out.println("Reading user biases");
    for (String line : new FileLineIterable(new File(userBiasesFilePath))) {
        String[] parts = sep.split(line);
        userBiases[Integer.parseInt(parts[0])] = Double.parseDouble(parts[1]);
    }

    Iterator<Rating> trainRatings = new RatingsIterable(new File(trainingSetPath)).iterator();
    Iterator<Rating> heldOutRatings = new RatingsIterable(new File(holdoutSetPath)).iterator();

    int currentUser = 0;
    // Accumulated (item -> rating) preferences of the user currently being read.
    OpenIntDoubleHashMap prefs = new OpenIntDoubleHashMap();

    int usersProcessed = 0;
    RunningAverage rmse = new FullRunningAverage();
    RunningAverage mae = new FullRunningAverage();

    RunningAverage rmseBase = new FullRunningAverage();
    RunningAverage maeBase = new FullRunningAverage();

    while (trainRatings.hasNext()) {
        Rating rating = trainRatings.next();
        if (rating.user() != currentUser) {

            // The previous user's training ratings are complete: evaluate that
            // user's 10 held-out ratings before moving on.
            for (int n = 0; n < 10; n++) {
                Rating heldOutRating = heldOutRatings.next();
                Preconditions.checkState(heldOutRating.user() == currentUser);

                double preference = 0.0;
                double totalSimilarity = 0.0;
                int count = 0;

                // Weighted sum over rated neighbors of the held-out item,
                // using bias-corrected residuals.
                Iterator<Vector.Element> similarItems = similarities.viewRow(heldOutRating.item())
                        .iterateNonZero();
                while (similarItems.hasNext()) {
                    Vector.Element similarity = similarItems.next();
                    int similarItem = similarity.index();
                    if (prefs.containsKey(similarItem)) {
                        preference += similarity.get() * (prefs.get(similarItem)
                                - (mu + userBiases[currentUser] + itemBiases[similarItem]));
                        totalSimilarity += Math.abs(similarity.get());
                        count++;

                    }
                }

                double baselineEstimate = mu + userBiases[currentUser] + itemBiases[heldOutRating.item()];
                double estimate = baselineEstimate;

                // Only trust the neighborhood estimate if more than one neighbor
                // contributed.
                if (count > 1) {
                    estimate += preference / totalSimilarity;
                }

                double baseError = Math.abs(heldOutRating.rating() - baselineEstimate);
                maeBase.addDatum(baseError);
                rmseBase.addDatum(baseError * baseError);

                double error = Math.abs(heldOutRating.rating() - estimate);
                mae.addDatum(error);
                rmse.addDatum(error * error);

            }

            if (++usersProcessed % 10000 == 0) {
                System.out.println(usersProcessed + " users processed, MAE " + mae.getAverage() + ", RMSE "
                        + Math.sqrt(rmse.getAverage()) + " | baseline MAE " + maeBase.getAverage()
                        + ", baseline RMSE " + Math.sqrt(rmseBase.getAverage()));
            }

            currentUser = rating.user();
            prefs.clear();

        }
        prefs.put(rating.item(), rating.rating());

    }

    System.out.println(usersProcessed + " users processed, MAE " + mae.getAverage() + ", RMSE "
            + Math.sqrt(rmse.getAverage()) + " | baseline MAE " + maeBase.getAverage() + ", baseline RMSE "
            + Math.sqrt(rmseBase.getAverage()));
}

From source file:hadoop.api.AggregateAndRecommendReducer.java

License:Apache License

@Override
protected void setup(Context context) throws IOException {
    Configuration conf = context.getConfiguration();
    recommendationsPerUser = conf.getInt(NUM_RECOMMENDATIONS, DEFAULT_NUM_RECOMMENDATIONS);
    booleanData = conf.getBoolean(RecommenderJob.BOOLEAN_DATA, false);
    indexItemIDMap = TasteHadoopUtils.readIDIndexMap(conf.get(ITEMID_INDEX_PATH), conf);

    String itemFilePathString = conf.get(ITEMS_FILE);
    if (itemFilePathString != null) {
        itemsToRecommendFor = new FastIDSet();
        for (String line : new FileLineIterable(HadoopUtil.openStream(new Path(itemFilePathString), conf))) {
            try {
                itemsToRecommendFor.add(Long.parseLong(line));
            } catch (NumberFormatException nfe) {
                log.warn("itemsFile line ignored: {}", line);
            }//from w ww.ja v a 2 s .c  om
        }
    }
}

From source file:io.ssc.relationdiscovery.Utils.java

License:Open Source License

/**
 * Loads a sparse entity/pattern co-occurrence matrix from a text file.
 *
 * Each line holds a 1-based entity index followed by tab-separated
 * "patternIndex:value" pairs; indices are shifted to 0-based positions, with
 * patterns as rows and entities as columns.
 *
 * @param occurrences text file of co-occurrence counts
 * @param numRows number of rows (patterns) in the resulting matrix
 * @param numColumns number of columns (entities) in the resulting matrix
 * @return sparse matrix of occurrence values
 * @throws IOException if the file cannot be read
 */
public static Matrix loadOccurrences(File occurrences, int numRows, int numColumns) throws IOException {

    Pattern tab = Pattern.compile("\t");
    Pattern colon = Pattern.compile(":");

    Matrix matrix = new SparseRowMatrix(numRows, numColumns);

    for (String line : new FileLineIterable(occurrences)) {
        String[] fields = tab.split(line);
        if (fields.length <= 1) {
            // Line carries no pattern:value pairs; nothing to record.
            continue;
        }

        int entityIndex = Integer.parseInt(fields[0]);
        for (int i = 1; i < fields.length; i++) {
            String[] pair = colon.split(fields[i]);
            int patternIndex = Integer.parseInt(pair[0]);
            double value = Double.parseDouble(pair[1]);
            // Input indices are 1-based; the matrix is 0-based.
            matrix.setQuick(patternIndex - 1, entityIndex - 1, value);
        }
    }
    return matrix;
}

From source file:io.ssc.relationdiscovery.Utils.java

License:Open Source License

/**
 * Loads pattern labels from a tab-separated file.
 *
 * Each line holds a 1-based pattern index and its label; indices are shifted
 * to 0-based keys in the returned map.
 *
 * @param patternsFile tab-separated file of (index, label) pairs
 * @return map from 0-based pattern index to its label
 * @throws IOException if the file cannot be read
 */
public static OpenIntObjectHashMap<String> loadLabels(File patternsFile) throws IOException {
    // Parameterized to match the declared return type; the original used a raw
    // OpenIntObjectHashMap, triggering an unchecked-conversion warning.
    OpenIntObjectHashMap<String> labels = new OpenIntObjectHashMap<String>();
    Pattern splitter = Pattern.compile("\t");
    for (String line : new FileLineIterable(patternsFile)) {
        String[] parts = splitter.split(line);
        labels.put(Integer.parseInt(parts[0]) - 1, parts[1]);
    }
    return labels;
}

From source file:net.myrrix.client.ModelBuildTest.java

License:Apache License

/**
 * Verifies that the recommender becomes ready once enough data is ingested.
 *
 * Streams the test CSV line by line into the client; after every third line it
 * refreshes the model and asserts the client is ready once at least FEATURES
 * lines have been ingested, and not ready before that.
 */
@Test
public void testWaitForBuild() throws Exception {
    ClientRecommender client = getClient();
    // Renamed from the original's misleading "testDataDir": this points at a
    // single CSV file, not a directory.
    File testDataFile = new File("testdata/grouplens100K-45/filtered.csv");
    int count = 0;
    for (String line : new FileLineIterable(testDataFile)) {
        client.ingest(new StringReader(line));
        if ((++count % 3) == 0) {
            log.info("Ingested {} users", count);
            client.refresh();
            // Give the server a moment to rebuild before probing readiness.
            Thread.sleep(1000L);
            if (count >= FEATURES) {
                assertTrue(client.isReady());
                break;
            } else {
                assertFalse(client.isReady());
            }
        }
    }
}