List of usage examples for org.apache.mahout.common.iterator FileLineIterable FileLineIterable
public FileLineIterable(InputStream is)
From source file:de.tuberlin.dima.aim.exercises.one.AverageTemperaturePerMonthTest.java
License:Open Source License
private Map<YearAndMonth, Double> readResults(File outputFile) throws IOException { Pattern separator = Pattern.compile("\t"); Map<YearAndMonth, Double> averageTemperatures = Maps.newHashMap(); for (String line : new FileLineIterable(outputFile)) { String[] tokens = separator.split(line); int year = Integer.parseInt(tokens[0]); int month = Integer.parseInt(tokens[1]); double temperature = Double.parseDouble(tokens[2]); averageTemperatures.put(new YearAndMonth(year, month), temperature); }//from ww w .jav a 2 s. com return averageTemperatures; }
From source file:de.tuberlin.dima.aim.exercises.one.FilteringWordCountTest.java
License:Open Source License
protected Map<String, Integer> getCounts(File outputFile) throws IOException { Map<String, Integer> counts = Maps.newHashMap(); for (String line : new FileLineIterable(outputFile)) { String[] tokens = line.split("\t"); counts.put(tokens[0], Integer.parseInt(tokens[1])); }//from w w w. ja v a2 s.c om return counts; }
From source file:de.tuberlin.dima.aim.exercises.two.BookAndAuthorJoinTest.java
License:Open Source License
/**
 * Parses the join output into a multimap from author name to the books
 * attributed to that author. Each line must hold three tab-separated fields:
 * author, title, year.
 */
Multimap<String, Book> readBooksByAuthors(File outputFile) throws IOException {
    Pattern separator = Pattern.compile("\t");
    Multimap<String, Book> booksByAuthors = HashMultimap.create();
    for (String line : new FileLineIterable(outputFile)) {
        String[] fields = separator.split(line);
        Book book = new Book(fields[1], Integer.parseInt(fields[2]));
        booksByAuthors.put(fields[0], book);
    }
    return booksByAuthors;
}
From source file:de.tuberlin.dima.aim.exercises.two.SecondarySortBookSortTest.java
License:Open Source License
CenturyAndTitle[] asListFromHadoopOut(File file) throws IOException { Pattern separator = Pattern.compile("\t"); List<CenturyAndTitle> centuryAndTitles = Lists.newArrayList(); for (String line : new FileLineIterable(file)) { String[] tokens = separator.split(line); centuryAndTitles.add(new CenturyAndTitle(Integer.parseInt(tokens[0]), tokens[1])); }//from www. j a v a 2 s. com return centuryAndTitles.toArray(new CenturyAndTitle[centuryAndTitles.size()]); }
From source file:de.tuberlin.dima.aim.exercises.two.SecondarySortBookSortTest.java
License:Open Source License
CenturyAndTitle[] asListFromInput(File file) throws IOException { Pattern separator = Pattern.compile("\t"); List<CenturyAndTitle> centuryAndTitles = Lists.newArrayList(); for (String line : new FileLineIterable(file)) { String[] tokens = separator.split(line); centuryAndTitles.add(new CenturyAndTitle(Integer.parseInt(tokens[1].substring(0, 2)), tokens[2])); }// ww w .ja v a 2s .com return centuryAndTitles.toArray(new CenturyAndTitle[centuryAndTitles.size()]); }
From source file:de.tuberlin.dima.recsys.ssnmm.ratingprediction.Evaluate.java
License:Apache License
public static void main(String[] args) throws IOException { int numUsers = 1823179; int numItems = 136736; double mu = 3.157255412010664; String distributedSimilarityMatrixPath = "/home/ssc/Desktop/yahoo/similarityMatrix/"; String itemBiasesFilePath = "/home/ssc/Desktop/yahoo/itemBiases.tsv"; String userBiasesFilePath = "/home/ssc/Desktop/yahoo/userBiases.tsv"; String trainingSetPath = "/home/ssc/Entwicklung/datasets/yahoo-songs/songs.tsv"; String holdoutSetPath = "home/ssc/Entwicklung/datasets/yahoo-songs/holdout.tsv"; Matrix similarities = new SparseRowMatrix(numItems, numItems); System.out.println("Reading similarities..."); int similaritiesRead = 0; Configuration conf = new Configuration(); for (Pair<IntWritable, VectorWritable> pair : new SequenceFileDirIterable<IntWritable, VectorWritable>( new Path(distributedSimilarityMatrixPath), PathType.LIST, PathFilters.partFilter(), conf)) { int item = pair.getFirst().get(); Iterator<Vector.Element> elements = pair.getSecond().get().iterateNonZero(); while (elements.hasNext()) { Vector.Element elem = elements.next(); similarities.setQuick(item, elem.index(), elem.get()); similaritiesRead++;//w w w . j av a 2 s . 
co m } } System.out.println("Found " + similaritiesRead + " similarities"); Pattern sep = Pattern.compile("\t"); double[] itemBiases = new double[numItems]; double[] userBiases = new double[numUsers]; System.out.println("Reading item biases"); for (String line : new FileLineIterable(new File(itemBiasesFilePath))) { String[] parts = sep.split(line); itemBiases[Integer.parseInt(parts[0])] = Double.parseDouble(parts[1]); } System.out.println("Reading user biases"); for (String line : new FileLineIterable(new File(userBiasesFilePath))) { String[] parts = sep.split(line); userBiases[Integer.parseInt(parts[0])] = Double.parseDouble(parts[1]); } Iterator<Rating> trainRatings = new RatingsIterable(new File(trainingSetPath)).iterator(); Iterator<Rating> heldOutRatings = new RatingsIterable(new File(holdoutSetPath)).iterator(); int currentUser = 0; OpenIntDoubleHashMap prefs = new OpenIntDoubleHashMap(); int usersProcessed = 0; RunningAverage rmse = new FullRunningAverage(); RunningAverage mae = new FullRunningAverage(); RunningAverage rmseBase = new FullRunningAverage(); RunningAverage maeBase = new FullRunningAverage(); while (trainRatings.hasNext()) { Rating rating = trainRatings.next(); if (rating.user() != currentUser) { for (int n = 0; n < 10; n++) { Rating heldOutRating = heldOutRatings.next(); Preconditions.checkState(heldOutRating.user() == currentUser); double preference = 0.0; double totalSimilarity = 0.0; int count = 0; Iterator<Vector.Element> similarItems = similarities.viewRow(heldOutRating.item()) .iterateNonZero(); while (similarItems.hasNext()) { Vector.Element similarity = similarItems.next(); int similarItem = similarity.index(); if (prefs.containsKey(similarItem)) { preference += similarity.get() * (prefs.get(similarItem) - (mu + userBiases[currentUser] + itemBiases[similarItem])); totalSimilarity += Math.abs(similarity.get()); count++; } } double baselineEstimate = mu + userBiases[currentUser] + itemBiases[heldOutRating.item()]; double estimate = 
baselineEstimate; if (count > 1) { estimate += preference / totalSimilarity; } double baseError = Math.abs(heldOutRating.rating() - baselineEstimate); maeBase.addDatum(baseError); rmseBase.addDatum(baseError * baseError); double error = Math.abs(heldOutRating.rating() - estimate); mae.addDatum(error); rmse.addDatum(error * error); } if (++usersProcessed % 10000 == 0) { System.out.println(usersProcessed + " users processed, MAE " + mae.getAverage() + ", RMSE " + Math.sqrt(rmse.getAverage()) + " | baseline MAE " + maeBase.getAverage() + ", baseline RMSE " + Math.sqrt(rmseBase.getAverage())); } currentUser = rating.user(); prefs.clear(); } prefs.put(rating.item(), rating.rating()); } System.out.println(usersProcessed + " users processed, MAE " + mae.getAverage() + ", RMSE " + Math.sqrt(rmse.getAverage()) + " | baseline MAE " + maeBase.getAverage() + ", baseline RMSE " + Math.sqrt(rmseBase.getAverage())); }
From source file:hadoop.api.AggregateAndRecommendReducer.java
License:Apache License
@Override protected void setup(Context context) throws IOException { Configuration conf = context.getConfiguration(); recommendationsPerUser = conf.getInt(NUM_RECOMMENDATIONS, DEFAULT_NUM_RECOMMENDATIONS); booleanData = conf.getBoolean(RecommenderJob.BOOLEAN_DATA, false); indexItemIDMap = TasteHadoopUtils.readIDIndexMap(conf.get(ITEMID_INDEX_PATH), conf); String itemFilePathString = conf.get(ITEMS_FILE); if (itemFilePathString != null) { itemsToRecommendFor = new FastIDSet(); for (String line : new FileLineIterable(HadoopUtil.openStream(new Path(itemFilePathString), conf))) { try { itemsToRecommendFor.add(Long.parseLong(line)); } catch (NumberFormatException nfe) { log.warn("itemsFile line ignored: {}", line); }//from w ww.ja v a 2 s .c om } } }
From source file:io.ssc.relationdiscovery.Utils.java
License:Open Source License
/**
 * Loads a sparse pattern/entity occurrence matrix from a text file.
 * Each line holds a 1-based entity index followed by tab-separated
 * "patternIndex:value" pairs (also 1-based); both indices are shifted to
 * 0-based positions in the returned matrix. Lines with no pairs are skipped.
 */
public static Matrix loadOccurrences(File occurrences, int numRows, int numColumns) throws IOException {
    Pattern fieldSeparator = Pattern.compile("\t");
    Pattern pairSeparator = Pattern.compile(":");
    Matrix A = new SparseRowMatrix(numRows, numColumns);
    for (String line : new FileLineIterable(occurrences)) {
        String[] fields = fieldSeparator.split(line);
        if (fields.length <= 1) {
            continue; // entity with no recorded occurrences
        }
        int entityIndex = Integer.parseInt(fields[0]);
        for (int i = 1; i < fields.length; i++) {
            String[] pair = pairSeparator.split(fields[i]);
            int patternIndex = Integer.parseInt(pair[0]);
            A.setQuick(patternIndex - 1, entityIndex - 1, Double.parseDouble(pair[1]));
        }
    }
    return A;
}
From source file:io.ssc.relationdiscovery.Utils.java
License:Open Source License
public static OpenIntObjectHashMap<String> loadLabels(File patternsFile) throws IOException { OpenIntObjectHashMap labels = new OpenIntObjectHashMap(); Pattern splitter = Pattern.compile("\t"); for (String line : new FileLineIterable(patternsFile)) { String[] parts = splitter.split(line); labels.put(Integer.parseInt(parts[0]) - 1, parts[1]); }//from ww w .j av a 2 s. com return labels; }
From source file:net.myrrix.client.ModelBuildTest.java
License:Apache License
@Test public void testWaitForBuild() throws Exception { ClientRecommender client = getClient(); File testDataDir = new File("testdata/grouplens100K-45/filtered.csv"); int count = 0; for (String line : new FileLineIterable(testDataDir)) { client.ingest(new StringReader(line)); if ((++count % 3) == 0) { log.info("Ingested {} users", count); client.refresh();//from w w w. j ava 2s .c o m Thread.sleep(1000L); if (count >= FEATURES) { assertTrue(client.isReady()); break; } else { assertFalse(client.isReady()); } } } }