org.apache.mahout.clustering.minhash.LastfmDataConverter.java Source code

Introduction

Here is the source code for org.apache.mahout.clustering.minhash.LastfmDataConverter.java, a utility that converts the public Last.fm listening datasets into Mahout vector format, written as a Hadoop SequenceFile of (Text, VectorWritable) records for MinHash clustering.

Source

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.mahout.clustering.minhash;

import com.google.common.base.Charsets;
import com.google.common.collect.Lists;
import com.google.common.collect.Maps;
import com.google.common.io.Closeables;
import com.google.common.io.Files;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
import org.apache.mahout.math.SequentialAccessSparseVector;
import org.apache.mahout.math.Vector;
import org.apache.mahout.math.VectorWritable;

import java.io.BufferedReader;
import java.io.File;
import java.io.IOException;
import java.util.List;
import java.util.Map;
import java.util.regex.Pattern;

public final class LastfmDataConverter {

    private static final Pattern TAB_PATTERN = Pattern.compile("\t");

    // We cluster similar items from the following Last.fm dataset:
    // http://www.iua.upf.es/~ocelma/MusicRecommendationDataset/index.html
    //
    // Preparing the data set means converting it into a format that can be
    // read by the MinHash algorithm.
    enum Lastfm {
        USERS_360K(17559530), USERS_1K(19150868);
        private final int totalRecords;

        Lastfm(int totalRecords) {
            this.totalRecords = totalRecords;
        }

        int getTotalRecords() {
            return totalRecords;
        }
    }

    private LastfmDataConverter() {
    }

    private static String usedMemory() {
        Runtime runtime = Runtime.getRuntime();
        return "Used Memory: [" + (runtime.totalMemory() - runtime.freeMemory()) / (1024 * 1024) + " MB] ";
    }

    /* Get the feature from the parsed record */
    private static String getFeature(String[] fields, Lastfm dataSet) {
        if (dataSet == Lastfm.USERS_360K) {
            return fields[0];
        } else {
            return fields[2];
        }
    }

    /* Get the item from the parsed record */
    private static String getItem(String[] fields, Lastfm dataSet) {
        if (dataSet == Lastfm.USERS_360K) {
            return fields[2];
        } else {
            return fields[0];
        }
    }
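
    /*
     * Tab-separated field layouts, per the published dataset descriptions
     * (shown here for reference; not part of the original source):
     *   360K Users: user-sha1 \t artist-mbid \t artist-name \t plays
     *   1K Users:   user-id \t timestamp \t artist-mbid \t artist-name
     *               \t track-mbid \t track-name
     * Hence for 360K Users the feature is fields[0] (user) and the item is
     * fields[2] (artist name); for 1K Users the roles flip: the feature is
     * fields[2] (artist mbid) and the item is fields[0] (user id).
     */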

    /**
     * Reads the Last.fm dataset and constructs a Map of (item, features).
     * For the 360K Users dataset: (Item=Artist, Feature=User).
     * For the 1K Users dataset: (Item=User, Feature=Artist).
     *
     * @param inputFile
     *          Last.fm dataset file on the local file system.
     * @param dataSet
     *          Type of dataset - 360K Users or 1K Users.
     * @return a map from each item to the list of its feature indexes.
     */
    public static Map<String, List<Integer>> convertToItemFeatures(String inputFile, Lastfm dataSet)
            throws IOException {
        long totalRecords = dataSet.getTotalRecords();
        Map<String, Integer> featureIdxMap = Maps.newHashMap();
        Map<String, List<Integer>> itemFeaturesMap = Maps.newHashMap();
        String msg = usedMemory() + "Converting data to internal vector format: ";
        BufferedReader br = Files.newReader(new File(inputFile), Charsets.UTF_8);
        try {
            System.out.print(msg);
            int prevPercentDone = 1;
            double percentDone = 0.0;
            long parsedRecords = 0;
            String line;
            while ((line = br.readLine()) != null) {
                String[] fields = TAB_PATTERN.split(line);
                String feature = getFeature(fields, dataSet);
                String item = getItem(fields, dataSet);
                // get the featureIdx
                Integer featureIdx = featureIdxMap.get(feature);
                if (featureIdx == null) {
                    featureIdx = featureIdxMap.size() + 1;
                    featureIdxMap.put(feature, featureIdx);
                }
                // add it to the corresponding feature idx map
                List<Integer> features = itemFeaturesMap.get(item);
                if (features == null) {
                    features = Lists.newArrayList();
                    itemFeaturesMap.put(item, features);
                }
                features.add(featureIdx);
                parsedRecords++;
                // Update the progress
                percentDone = parsedRecords * 100.0 / totalRecords;
                msg = usedMemory() + "Converting data to internal vector format: ";
                if (percentDone > prevPercentDone) {
                    System.out.print('\r' + msg + percentDone + '%');
                    prevPercentDone++;
                }
            }
            msg = usedMemory() + "Converting data to internal vector format: ";
            System.out.print('\r' + msg + percentDone + "% Completed\n");
        } finally {
            Closeables.closeQuietly(br);
        }
        return itemFeaturesMap;
    }
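
    /*
     * Worked example (hypothetical data, 360K Users layout
     * user \t artist-mbid \t artist-name \t plays):
     *
     *   user_a <TAB> mbid1 <TAB> radiohead <TAB> 12
     *   user_b <TAB> mbid1 <TAB> radiohead <TAB> 7
     *   user_a <TAB> mbid2 <TAB> autechre  <TAB> 3
     *
     * convertToItemFeatures() assigns feature indexes user_a -> 1,
     * user_b -> 2 and returns:
     *
     *   { "radiohead" -> [1, 2], "autechre" -> [1] }
     *
     * i.e. each artist (item) maps to the indexes of the users (features)
     * who listened to it.
     */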

    /**
     * Converts each record in the (item, features) map into Mahout vector
     * format and writes it to a SequenceFile for MinHash clustering.
     */
    public static boolean writeToSequenceFile(Map<String, List<Integer>> itemFeaturesMap, Path outputPath)
            throws IOException {
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(conf);
        fs.mkdirs(outputPath.getParent());
        long totalRecords = itemFeaturesMap.size();
        SequenceFile.Writer writer = new SequenceFile.Writer(fs, conf, outputPath, Text.class,
                VectorWritable.class);
        try {
            String msg = "Now writing vectorized data in sequence file format: ";
            System.out.print(msg);

            Text itemWritable = new Text();
            VectorWritable featuresWritable = new VectorWritable();

            int doneRecords = 0;
            int prevPercentDone = 1;

            for (Map.Entry<String, List<Integer>> itemFeature : itemFeaturesMap.entrySet()) {
                int numFeatures = itemFeature.getValue().size();
                itemWritable.set(itemFeature.getKey());
                // Store each feature index as a vector value; the vector
                // positions are just sequential slots.
                Vector featureVector = new SequentialAccessSparseVector(numFeatures);
                int i = 0;
                for (Integer feature : itemFeature.getValue()) {
                    featureVector.setQuick(i++, feature);
                }
                featuresWritable.set(featureVector);
                writer.append(itemWritable, featuresWritable);
                // Update the progress
                double percentDone = ++doneRecords * 100.0 / totalRecords;
                if (percentDone > prevPercentDone) {
                    System.out.print('\r' + msg + percentDone + "% " + (percentDone >= 100 ? "Completed\n" : ""));
                    prevPercentDone++;
                }
            }
        } finally {
            Closeables.closeQuietly(writer);
        }
        return true;
    }
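
    /*
     * Encoding sketch (continuing the hypothetical example above): the entry
     * { "radiohead" -> [1, 2] } is written as key Text("radiohead") and a
     * SequentialAccessSparseVector of cardinality 2 holding the feature
     * indexes as values, i.e. {0:1.0, 1:2.0}. As far as this converter is
     * concerned, the feature identity lives entirely in the values.
     */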

    public static void main(String[] args) throws Exception {
        if (args.length < 3) {
            System.out.println("[Usage]: LastfmDataConverter <input> <output> <dataset>");
            System.out.println("   <input>: Absolute path to the local file [usersha1-artmbid-artname-plays.tsv] ");
            System.out.println("  <output>: Absolute path to the HDFS output file");
            System.out.println(" <dataset>: Either of the two Lastfm public datasets. "
                    + "Must be either 'Users360K' or 'Users1K'");
            System.out.println("Note:- Hadoop configuration pointing to HDFS namenode should be in classpath");
            return;
        }
        Lastfm dataSet = Lastfm.valueOf(args[2]);
        Map<String, List<Integer>> itemFeatures = convertToItemFeatures(args[0], dataSet);
        if (itemFeatures.isEmpty()) {
            throw new IllegalStateException("Error converting the data file: [" + args[0] + ']');
        }
        Path output = new Path(args[1]);
        boolean status = writeToSequenceFile(itemFeatures, output);
        if (status) {
            System.out.println("Data converted and written successfully to HDFS location: [" + output + ']');
        } else {
            System.err.println("Error writing the converted data to HDFS location: [" + output + ']');
        }
    }
}
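
Example

A hypothetical invocation (the jar name and paths are placeholders), assuming the class is packaged in a Mahout job jar and a Hadoop configuration is on the classpath:

hadoop jar mahout-examples-job.jar \
    org.apache.mahout.clustering.minhash.LastfmDataConverter \
    /data/usersha1-artmbid-artname-plays.tsv /user/me/lastfm-vectors USERS_360K

And a minimal sketch, not part of the original class, that reads the generated SequenceFile back and prints each (item, feature vector) record; the class name LastfmVectorDump is an assumption for illustration:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
import org.apache.mahout.math.VectorWritable;

public final class LastfmVectorDump {

    private LastfmVectorDump() {
    }

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(conf);
        // args[0] is the <output> path that was passed to LastfmDataConverter
        Path input = new Path(args[0]);
        SequenceFile.Reader reader = new SequenceFile.Reader(fs, input, conf);
        try {
            Text item = new Text();
            VectorWritable features = new VectorWritable();
            // Each record is (item, feature vector), exactly as written by
            // writeToSequenceFile() above.
            while (reader.next(item, features)) {
                System.out.println(item + " => " + features.get().asFormatString());
            }
        } finally {
            reader.close();
        }
    }
}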