Example usage for org.apache.mahout.common.iterator FileLineIterator FileLineIterator

List of usage examples for org.apache.mahout.common.iterator FileLineIterator FileLineIterator

Introduction

This page lists usage examples for the FileLineIterator constructor of the org.apache.mahout.common.iterator.FileLineIterator class.

Prototype

public FileLineIterator(InputStream is, boolean skipFirstLine) throws IOException 

Source Link

Usage

From source file:org.gpfvic.mahout.cf.taste.impl.model.file.FileDataModel.java

License:Apache License

/**
 * Creates a data model backed by the given file, inspects the first data line to work out the
 * delimiter and whether preference values are present, and then triggers an initial
 * {@code reload()}.
 *
 * @param dataFile file to read; must exist, must not be a directory and must not be empty
 * @param transpose stored on the instance for later use when the file is processed
 * @param minReloadIntervalMS minimum reload interval in milliseconds; must be non-negative
 * @param delimiterRegex If your data file doesn't use '\t' or ',' as delimiters, you can specify
 *        your own using a regex pattern. Pass {@code null} to have the delimiter detected from
 *        the first data line.
 * @throws FileNotFoundException if {@code dataFile} does not exist or is a directory
 * @throws IllegalArgumentException if the file is empty, contains no data lines, or
 *         {@code minReloadIntervalMS} is negative
 * @throws IOException if the file cannot be read
 */
public FileDataModel(File dataFile, boolean transpose, long minReloadIntervalMS, String delimiterRegex)
        throws IOException {

    this.dataFile = Preconditions.checkNotNull(dataFile.getAbsoluteFile());
    if (!dataFile.exists() || dataFile.isDirectory()) {
        throw new FileNotFoundException(dataFile.toString());
    }
    Preconditions.checkArgument(dataFile.length() > 0L, "dataFile is empty");
    Preconditions.checkArgument(minReloadIntervalMS >= 0L, "minReloadIntervalMs must be non-negative");

    log.info("Creating FileDataModel for file {}", dataFile);

    this.lastModified = dataFile.lastModified();
    this.lastUpdateFileModified = readLastUpdateFileModified();

    // Find the first line that is neither blank nor a comment. The iterator is closed in a
    // finally block so the underlying file handle is released even if reading fails.
    String firstLine = null;
    FileLineIterator iterator = new FileLineIterator(dataFile, false);
    try {
        while (firstLine == null && iterator.hasNext()) {
            String line = iterator.next();
            if (!line.isEmpty() && line.charAt(0) != COMMENT_CHAR) {
                firstLine = line;
            }
        }
    } finally {
        // Best-effort close: an IOException raised by close() is swallowed, as before.
        Closeables.close(iterator, true);
    }
    // A file made up solely of blank/comment lines has nothing to infer the format from.
    Preconditions.checkArgument(firstLine != null, "dataFile contains no data lines");

    char delimiter;
    if (delimiterRegex == null) {
        delimiter = determineDelimiter(firstLine);
        delimiterPattern = Splitter.on(delimiter);
    } else {
        delimiter = '\0';
        delimiterPattern = Splitter.onPattern(delimiterRegex);
        if (!delimiterPattern.split(firstLine).iterator().hasNext()) {
            throw new IllegalArgumentException("Did not find a delimiter(pattern) in first line");
        }
    }
    List<String> firstLineSplit = new ArrayList<>();
    for (String token : delimiterPattern.split(firstLine)) {
        firstLineSplit.add(token);
    }
    // If preference value exists and isn't empty then the file is specifying pref values
    hasPrefValues = firstLineSplit.size() >= 3 && !firstLineSplit.get(2).isEmpty();

    this.reloadLock = new ReentrantLock();
    this.transpose = transpose;
    this.minReloadIntervalMS = minReloadIntervalMS;

    reload();
}

From source file:org.gpfvic.mahout.cf.taste.impl.model.file.FileDataModel.java

License:Apache License

/**
 * Builds the in-memory {@link DataModel} from the main data file and/or its update files.
 *
 * <p>A full re-read of the main data file happens when there is no current delegate, or when
 * the file's modification time is more than {@code minReloadIntervalMS} past the previously
 * recorded one. Otherwise the current delegate's raw data is reused and only update files are
 * applied on top of it. Depending on {@code hasPrefValues} the result is a
 * {@link GenericDataModel} or a {@link GenericBooleanPrefDataModel}.
 *
 * <p>Also refreshes the recorded {@code lastModified} and {@code lastUpdateFileModified}
 * values. Every file iterator opened here is closed once it has been processed.
 *
 * @return the newly built data model
 * @throws IOException if a data or update file cannot be read
 */
protected DataModel buildModel() throws IOException {

    long newLastModified = dataFile.lastModified();
    long newLastUpdateFileModified = readLastUpdateFileModified();

    boolean loadFreshData = delegate == null || newLastModified > lastModified + minReloadIntervalMS;

    // Capture the previous update-file timestamp before the field is overwritten below.
    long oldLastUpdateFileModified = lastUpdateFileModified;
    lastModified = newLastModified;
    lastUpdateFileModified = newLastUpdateFileModified;

    FastByIDMap<FastByIDMap<Long>> timestamps = new FastByIDMap<>();

    if (hasPrefValues) {

        if (loadFreshData) {

            FastByIDMap<Collection<Preference>> data = new FastByIDMap<>();
            try (FileLineIterator iterator = new FileLineIterator(dataFile, false)) {
                processFile(iterator, data, timestamps, false);
            }

            for (File updateFile : findUpdateFilesAfter(newLastModified)) {
                try (FileLineIterator updateIterator = new FileLineIterator(updateFile, false)) {
                    processFile(updateIterator, data, timestamps, false);
                }
            }

            return new GenericDataModel(GenericDataModel.toDataMap(data, true), timestamps);

        } else {

            FastByIDMap<PreferenceArray> rawData = ((GenericDataModel) delegate).getRawUserData();

            for (File updateFile : findUpdateFilesAfter(
                    Math.max(oldLastUpdateFileModified, newLastModified))) {
                try (FileLineIterator updateIterator = new FileLineIterator(updateFile, false)) {
                    processFile(updateIterator, rawData, timestamps, true);
                }
            }

            return new GenericDataModel(rawData, timestamps);

        }

    } else {

        if (loadFreshData) {

            FastByIDMap<FastIDSet> data = new FastByIDMap<>();
            try (FileLineIterator iterator = new FileLineIterator(dataFile, false)) {
                processFileWithoutID(iterator, data, timestamps);
            }

            for (File updateFile : findUpdateFilesAfter(newLastModified)) {
                try (FileLineIterator updateIterator = new FileLineIterator(updateFile, false)) {
                    processFileWithoutID(updateIterator, data, timestamps);
                }
            }

            return new GenericBooleanPrefDataModel(data, timestamps);

        } else {

            FastByIDMap<FastIDSet> rawData = ((GenericBooleanPrefDataModel) delegate).getRawUserData();

            for (File updateFile : findUpdateFilesAfter(
                    Math.max(oldLastUpdateFileModified, newLastModified))) {
                try (FileLineIterator updateIterator = new FileLineIterator(updateFile, false)) {
                    processFileWithoutID(updateIterator, rawData, timestamps);
                }
            }

            return new GenericBooleanPrefDataModel(rawData, timestamps);

        }

    }
}