List of usage examples for the org.apache.mahout.common.iterator.FileLineIterator constructor
public FileLineIterator(InputStream is, boolean skipFirstLine) throws IOException
From source file:org.gpfvic.mahout.cf.taste.impl.model.file.FileDataModel.java
License:Apache License
/** * @param delimiterRegex If your data file don't use '\t' or ',' as delimiters, you can specify * user own using regex pattern./* w w w . jav a 2 s . c om*/ * @throws IOException */ public FileDataModel(File dataFile, boolean transpose, long minReloadIntervalMS, String delimiterRegex) throws IOException { this.dataFile = Preconditions.checkNotNull(dataFile.getAbsoluteFile()); if (!dataFile.exists() || dataFile.isDirectory()) { throw new FileNotFoundException(dataFile.toString()); } Preconditions.checkArgument(dataFile.length() > 0L, "dataFile is empty"); Preconditions.checkArgument(minReloadIntervalMS >= 0L, "minReloadIntervalMs must be non-negative"); log.info("Creating FileDataModel for file {}", dataFile); this.lastModified = dataFile.lastModified(); this.lastUpdateFileModified = readLastUpdateFileModified(); FileLineIterator iterator = new FileLineIterator(dataFile, false); String firstLine = iterator.peek(); while (firstLine.isEmpty() || firstLine.charAt(0) == COMMENT_CHAR) { iterator.next(); firstLine = iterator.peek(); } Closeables.close(iterator, true); char delimiter; if (delimiterRegex == null) { delimiter = determineDelimiter(firstLine); delimiterPattern = Splitter.on(delimiter); } else { delimiter = '\0'; delimiterPattern = Splitter.onPattern(delimiterRegex); if (!delimiterPattern.split(firstLine).iterator().hasNext()) { throw new IllegalArgumentException("Did not find a delimiter(pattern) in first line"); } } List<String> firstLineSplit = new ArrayList<>(); for (String token : delimiterPattern.split(firstLine)) { firstLineSplit.add(token); } // If preference value exists and isn't empty then the file is specifying pref values hasPrefValues = firstLineSplit.size() >= 3 && !firstLineSplit.get(2).isEmpty(); this.reloadLock = new ReentrantLock(); this.transpose = transpose; this.minReloadIntervalMS = minReloadIntervalMS; reload(); }
From source file:org.gpfvic.mahout.cf.taste.impl.model.file.FileDataModel.java
License:Apache License
protected DataModel buildModel() throws IOException { long newLastModified = dataFile.lastModified(); long newLastUpdateFileModified = readLastUpdateFileModified(); boolean loadFreshData = delegate == null || newLastModified > lastModified + minReloadIntervalMS; long oldLastUpdateFileModifieid = lastUpdateFileModified; lastModified = newLastModified;//from w w w . j a v a2 s . c om lastUpdateFileModified = newLastUpdateFileModified; FastByIDMap<FastByIDMap<Long>> timestamps = new FastByIDMap<>(); if (hasPrefValues) { if (loadFreshData) { FastByIDMap<Collection<Preference>> data = new FastByIDMap<>(); FileLineIterator iterator = new FileLineIterator(dataFile, false); processFile(iterator, data, timestamps, false); for (File updateFile : findUpdateFilesAfter(newLastModified)) { processFile(new FileLineIterator(updateFile, false), data, timestamps, false); } return new GenericDataModel(GenericDataModel.toDataMap(data, true), timestamps); } else { FastByIDMap<PreferenceArray> rawData = ((GenericDataModel) delegate).getRawUserData(); for (File updateFile : findUpdateFilesAfter( Math.max(oldLastUpdateFileModifieid, newLastModified))) { processFile(new FileLineIterator(updateFile, false), rawData, timestamps, true); } return new GenericDataModel(rawData, timestamps); } } else { if (loadFreshData) { FastByIDMap<FastIDSet> data = new FastByIDMap<>(); FileLineIterator iterator = new FileLineIterator(dataFile, false); processFileWithoutID(iterator, data, timestamps); for (File updateFile : findUpdateFilesAfter(newLastModified)) { processFileWithoutID(new FileLineIterator(updateFile, false), data, timestamps); } return new GenericBooleanPrefDataModel(data, timestamps); } else { FastByIDMap<FastIDSet> rawData = ((GenericBooleanPrefDataModel) delegate).getRawUserData(); for (File updateFile : findUpdateFilesAfter( Math.max(oldLastUpdateFileModifieid, newLastModified))) { processFileWithoutID(new FileLineIterator(updateFile, false), rawData, timestamps); } return new 
GenericBooleanPrefDataModel(rawData, timestamps); } } }