org.neo4j.examples.imdb.parser.ImdbParser.java Source code

Introduction

Here is the source code for org.neo4j.examples.imdb.parser.ImdbParser.java
Source

/**
 * Licensed to Neo Technology under one or more contributor
 * license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright
 * ownership. Neo Technology licenses this file to you under
 * the Apache License, Version 2.0 (the "License"); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */
package org.neo4j.examples.imdb.parser;

import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.LinkedList;
import java.util.List;
import java.util.zip.GZIPInputStream;
import java.util.zip.ZipInputStream;
import org.apache.commons.lang.WordUtils;
import org.graphipedia.dataimport.ProgressCounter;
import org.neo4j.examples.imdb.domain.MovieFormat;
import org.neo4j.examples.imdb.domain.RelTypes;

/**
 * An <code>ImdbParser</code> parses the movie, rating, person (actors,
 * actresses, directors, ...), genre, keyword, country and language lists from
 * the imdb text data (http://www.imdb.com/interfaces). It forwards the parsed
 * information to an {@link ImdbReader}.
 */
public class ImdbParser {
    // Heading line searched for in the movie list file; parsing starts after it.
    private static final String MOVIES_MARKER = "MOVIES LIST";
    // Number of lines discarded right after MOVIES_MARKER before data is read.
    private static final int MOVIES_SKIPS = 2;
    // Heading line searched for in the actresses list file.
    private static final String ACTRESSES_MARKER = "THE ACTRESSES LIST";
    // Number of lines discarded right after ACTRESSES_MARKER.
    private static final int ACTRESS_SKIPS = 4;
    // Heading line searched for in the actors list file.
    private static final String ACTOR_MARKER = "THE ACTORS LIST";
    // Number of lines discarded right after ACTOR_MARKER.
    private static final int ACTOR_SKIPS = 4;
    // Flush interval: buffered items are handed to the reader each time the
    // running item count reaches a multiple of this value.
    private static final int BUFFER_SIZE = 200;
    // Receiver of every parsed item; checked for null in the constructor.
    private final ImdbReader reader;

    /**
     * Create a new Imdb parser.
     * @param reader
     *            reader this parser will use to forward events to
     */
    public ImdbParser(final ImdbReader reader) {
        if (reader == null) {
            throw new IllegalArgumentException("Null ImdbReader");
        }
        this.reader = reader;
    }

    /**
     * Parses a tab-separated movie list file, each line containing a movie
     * title and the year the movie was released. The file can be .gz or .zip
     * compressed, and must then have the corresponding file extension.
     * <p>
     * Blank lines, video games <code>(VG)</code>, videos <code>(V)</code>,
     * episode entries (title containing <code>{</code>), entries without a
     * known year and consecutive duplicates of the same title are skipped.
     * Titles that start with a double quote are treated as series: the quotes
     * are removed and <code>" (SERIE)"</code> is appended. Parsed movies are
     * forwarded to the reader in batches of {@link #BUFFER_SIZE}.
     * @param file
     *            name of movie list file
     * @return a human readable summary with the number of movies parsed
     * @throws IllegalArgumentException
     *             if <code>file</code> is <code>null</code>
     * @throws NumberFormatException
     *             if the first four characters of a year column are not a number
     * @throws IOException
     *             if unable to open or read the movie list file
     */
    public String parseMovies(final String file) throws IOException {
        if (file == null) {
            throw new IllegalArgumentException("Null movie file");
        }
        final List<MovieData> buffer = new LinkedList<MovieData>();
        final BufferedReader fileReader = getFileReader(file, MOVIES_MARKER, MOVIES_SKIPS);
        try {
            final ProgressCounter movieCount = new ProgressCounter("movies");
            String previousMovieTitle = null;
            // The for-loop update clause guarantees that every "continue"
            // advances to the next line; the previous while-loop forgot to
            // read the next line for duplicate titles and spun forever.
            for (String line = fileReader.readLine(); line != null; line = fileReader.readLine()) {
                // get rid of blank lines, video games, video clip
                if ("".equals(line) || line.indexOf("(VG)") != -1 || line.indexOf("(V)") != -1) {
                    continue;
                }
                final int yearSep = line.indexOf('\t');
                if (yearSep <= 0) {
                    // no tab, or nothing in front of it: not a movie line
                    continue;
                }
                MovieFormat format = MovieFormat.FILM;
                String title = line.substring(0, yearSep).trim();
                String yearString = line.substring(yearSep).trim();
                if (yearString.length() > 4) {
                    // keep only the first four characters of the year column
                    yearString = yearString.substring(0, 4);
                }
                if (title.startsWith("\"")) {
                    format = MovieFormat.SERIE;
                    title = title.replace("\"", "") + " (SERIE)";
                }
                if (line.indexOf("(TV)") != -1) {
                    // a (TV) tag takes precedence over the SERIE format
                    format = MovieFormat.TV;
                }
                // unknown year or episode entry: nothing to import
                if (yearString.length() == 0 || yearString.charAt(0) == '?' || title.contains("{")) {
                    continue;
                }
                if (title.equals(previousMovieTitle)) {
                    // consecutive duplicate of the movie just added
                    continue;
                }
                previousMovieTitle = title;
                final int year = Integer.parseInt(yearString);
                buffer.add(new MovieData(title, year, format));
                movieCount.increment();
                if (movieCount.getCount() % BUFFER_SIZE == 0) {
                    reader.newMovies(buffer);
                    buffer.clear();
                }
            }
            // forward whatever is left in the last, partially filled batch
            reader.newMovies(buffer);
            return (movieCount.getCount() + " movies parsed and injected.");
        } finally {
            fileReader.close();
        }
    }

    /**
     * Parses a movie rating list file. Each line is split on double spaces;
     * the last three resulting tokens are read as votes, rank and movie title
     * (in that order). Parsing stops at the first empty line. The file can be
     * .gz or .zip compressed, and must then have the corresponding file
     * extension.
     * <p>
     * Titles that start with a double quote are treated as series: the quotes
     * are removed and <code>" (SERIE)"</code> is appended. Episode entries
     * (title containing <code>{</code>) and lines with fewer than three tokens
     * are skipped.
     * @param file
     *            name of rating list file
     * @return a human readable summary with the number of ratings parsed
     * @throws IllegalArgumentException
     *             if <code>file</code> is <code>null</code>
     * @throws IOException
     *             if unable to open or read the rating list file
     */
    public String parseRatings(final String file) throws IOException {
        if (file == null) {
            throw new IllegalArgumentException("Null rating file");
        }
        final BufferedReader fileReader = getFileReader(file, "MOVIE RATINGS REPORT", 2);
        try {
            final ProgressCounter ratingCount = new ProgressCounter("ratings");
            for (String line = fileReader.readLine(); line != null && !"".equals(line);
                    line = fileReader.readLine()) {
                final String[] tokens = line.split("  ");
                if (tokens.length < 3) {
                    // malformed line: indexing from the end would throw
                    continue;
                }
                String title = tokens[tokens.length - 1].trim();
                final String rank = tokens[tokens.length - 2].trim();
                final String votes = tokens[tokens.length - 3].trim();

                if (title.startsWith("\"")) {
                    title = title.replace("\"", "") + " (SERIE)";
                }
                if (title.contains("{")) {
                    // episode entry, not imported
                    continue;
                }

                reader.newRating(new RatingData(title, rank, votes));
                ratingCount.increment();
            }
            return (ratingCount.getCount() + " ratings parsed and injected.");
        } finally {
            fileReader.close();
        }
    }

    /**
     * Parses a tab-separated actors list file. A line begins with actor name
     * then followed by a tab and a movie title the actor acted in. Additional
     * movies the current actor acted in are found on the following line that
     * starts with a tab followed by the movie title.
     * @param actorFile
     *            name of actor list file
     * @return the summary produced by {@link #parsePersonFile}, prefixed with
     *         <code>"Actors: "</code> and terminated by a newline
     * @throws IOException
     *             if unable to open or read the actor list file
     */
    public String parseActor(final String actorFile) throws IOException {
        final BufferedReader fileReader = getFileReader(actorFile, ACTOR_MARKER, ACTOR_SKIPS);
        try {
            return "Actors: " + parsePersonFile(fileReader, RelTypes.ACTOR) + "\n";
        } finally {
            // this method opened the reader, so it is responsible for closing it
            fileReader.close();
        }
    }

    /**
     * Parses a tab-separated actresses list file. A line begins with the
     * actress name then followed by a tab and a movie title she acted in.
     * Additional movies of the current actress are found on the following
     * lines that start with a tab followed by the movie title.
     * <p>
     * Credits are recorded with {@link RelTypes#ACTOR}, the same relationship
     * type used by {@link #parseActor(String)}.
     * <p>
     * Note from the original author: a flush of the index between the actor
     * and the actress import is required because there is an error in the
     * source files where Bishop, Pat (III) appears twice. That flush is not
     * performed by this method.
     *
     * @param actressFile
     *            name of actress list file
     * @return the summary produced by {@link #parsePersonFile}, prefixed with
     *         <code>"Actresses: "</code>
     * @throws IOException
     *             if unable to open or read the actress list file
     */
    public String parseActress(final String actressFile) throws IOException {
        final BufferedReader fileReader = getFileReader(actressFile, ACTRESSES_MARKER, ACTRESS_SKIPS);
        try {
            return "Actresses: " + parsePersonFile(fileReader, RelTypes.ACTOR);
        } finally {
            // this method opened the reader, so it is responsible for closing it
            fileReader.close();
        }
    }

    /**
     * Parses the body of a person list (actors, directors, ...) from an
     * already positioned reader and forwards the persons to the
     * {@link ImdbReader}.
     * <p>
     * Every line containing a tab is a credit: the text before the first tab
     * is the person name (empty on continuation lines, which belong to the
     * person named last), the text after it is the title. Titles are
     * normalised as follows:
     * <ul>
     * <li>the first <code>{...}</code> section (episode name) is removed;</li>
     * <li>titles starting with a double quote are series: quotes are removed
     * and <code>" (SERIE)"</code> is appended;</li>
     * <li>the text between <code>[</code> and <code>]</code> is taken as the
     * character name, and the title is cut at <code>[</code> (or at
     * <code>&lt;</code> when there is no <code>[</code>);</li>
     * <li>a title containing <code>)</code>, two spaces and <code>(</code> in
     * sequence is cut at the two spaces.</li>
     * </ul>
     * Empty titles, titles still containing <code>{</code> or
     * <code>????</code>, and consecutive duplicate titles of one person are
     * skipped.
     * <p>
     * The reader is read until end of stream but is NOT closed; closing it is
     * left to the caller.
     * @param fileReader
     *            reader positioned on the first data line
     * @param batchName
     *            relationship type attached to every parsed role
     * @return a human readable summary with the number of persons and roles
     * @throws IllegalArgumentException
     *             if <code>fileReader</code> is <code>null</code>
     * @throws IOException
     *             if reading fails
     */
    public String parsePersonFile(BufferedReader fileReader, RelTypes batchName) throws IOException {
        if (fileReader == null) {
            throw new IllegalArgumentException("Null " + batchName + " file");
        }
        String currentActor = null;
        String previousTitleForActor = null;
        final List<PersonData> buffer = new LinkedList<PersonData>();
        final List<RoleData> movies = new ArrayList<RoleData>();
        int movieCount = 0;
        final ProgressCounter actorCount = new ProgressCounter(batchName.toString());
        for (String line = fileReader.readLine(); line != null; line = fileReader.readLine()) {
            final int actorSep = line.indexOf('\t');
            if (actorSep < 0) {
                // blank lines and lines without a tab carry no credit
                continue;
            }
            // There is an error in the source file where Cann, Nathan Maxwell is not capitalised
            final String actor = WordUtils.capitalizeFully(line.substring(0, actorSep).trim());
            if (actor != null && !"".equals(actor) && !actor.equals(currentActor)) {
                // a new person starts: close the previous one if it has roles
                if (!movies.isEmpty()) {
                    buffer.add(new PersonData(currentActor, movies.toArray(new RoleData[movies.size()])));
                    actorCount.increment();
                    movies.clear();
                    previousTitleForActor = null;
                }
                currentActor = actor;
            }
            String title = line.substring(actorSep).trim();
            MovieFormat format = MovieFormat.FILM;

            // normalize title : remove episode name
            if (title.contains("{") && title.contains("}")) {
                final int startEpisodeSep = title.indexOf("{");
                final int endEpisodeSep = title.indexOf("}");
                title = title.substring(0, startEpisodeSep).trim()
                        + title.substring(endEpisodeSep + 1, title.length());
            }
            if (title.startsWith("\"")) {
                format = MovieFormat.SERIE;
                title = title.replace("\"", "");
            }

            if (title.length() == 0 || title.contains("{") || title.startsWith("\"")
                    || title.contains("????")) {
                continue;
            }
            final int characterStart = title.indexOf('[');
            final int characterEnd = title.indexOf(']');
            String character = null;
            if (characterStart > 0 && characterEnd > characterStart) {
                character = title.substring(characterStart + 1, characterEnd);
            }
            // the <...> section is only used as a cut-off point, its content is ignored
            final int creditStart = title.indexOf('<');
            if (characterStart > 0) {
                title = title.substring(0, characterStart).trim();
            } else if (creditStart > 0) {
                title = title.substring(0, creditStart).trim();
            }
            final int spaces = title.indexOf("  ");
            // the length check keeps charAt(spaces + 2) inside the string
            if (spaces > 0 && spaces + 2 < title.length()
                    && title.charAt(spaces - 1) == ')' && title.charAt(spaces + 2) == '(') {
                title = title.substring(0, spaces).trim();
            }
            if (format == MovieFormat.SERIE) {
                title = title + " (SERIE)";
            }
            if (title.equals(previousTitleForActor)) {
                // same title listed twice in a row for this person
                continue;
            }
            previousTitleForActor = title;

            movies.add(new RoleData(title, batchName, character));
            movieCount++;
            if (movieCount % BUFFER_SIZE == 0) {
                reader.newPersons(buffer);
                buffer.clear();
            }
        }
        // The person being collected when the stream ends has not been
        // buffered yet; without this step the last person was lost.
        if (!movies.isEmpty()) {
            buffer.add(new PersonData(currentActor, movies.toArray(new RoleData[movies.size()])));
            actorCount.increment();
        }
        reader.newPersons(buffer);
        return (actorCount.getCount() + " " + batchName + " added including " + movieCount
                + " characters parsed and injected.");
    }

    /**
     * Opens a list file, uncompressing it when the name ends with
     * <code>.gz</code> or <code>.zip</code> (first archive entry), and
     * positions the reader on the first data line.
     * <p>
     * Text is decoded with the platform default charset.
     * NOTE(review): the encoding of the imdb list files is not established
     * here - confirm whether an explicit charset is needed.
     * @param file
     *            the file name
     * @param pattern
     *            heading line that must match a whole line of the file
     *            exactly; everything up to and including it is discarded
     * @param skipLines
     *            number of additional lines discarded after the heading line
     * @return a file reader that uncompresses data if needed, positioned just
     *         after the skipped lines
     * @throws IllegalArgumentException
     *             if <code>file</code> is <code>null</code>
     * @throws IOException
     *             if reading fails, a zip archive has no entry, or the end of
     *             the file is reached before the heading line is found
     * @throws FileNotFoundException
     *             if the file cannot be opened
     */
    private BufferedReader getFileReader(final String file, String pattern, int skipLines)
            throws IOException, FileNotFoundException {
        if (file == null) {
            throw new IllegalArgumentException("Null file name");
        }
        final FileInputStream rawStream = new FileInputStream(file);
        BufferedReader fileReader = null;
        boolean positioned = false;
        try {
            // support compressed files
            if (file.endsWith(".gz")) {
                fileReader = new BufferedReader(new InputStreamReader(new GZIPInputStream(rawStream)));
            } else if (file.endsWith(".zip")) {
                final ZipInputStream zipStream = new ZipInputStream(rawStream);
                // a ZipInputStream yields no data until it is moved onto an entry
                if (zipStream.getNextEntry() == null) {
                    throw new IOException("Zip archive " + file + " contains no entries");
                }
                fileReader = new BufferedReader(new InputStreamReader(zipStream));
            } else {
                fileReader = new BufferedReader(new InputStreamReader(rawStream));
            }

            String line = "";
            while (!pattern.equals(line)) {
                line = fileReader.readLine();
                if (line == null) {
                    // end of file: without this check the loop never ended
                    throw new IOException("Marker line \"" + pattern + "\" not found in " + file);
                }
            }
            for (int i = 0; i < skipLines; i++) {
                fileReader.readLine();
            }

            positioned = true;
            return fileReader;
        } finally {
            if (!positioned) {
                // do not leak the file handle when opening or positioning failed
                try {
                    if (fileReader != null) {
                        fileReader.close();
                    } else {
                        rawStream.close();
                    }
                } catch (IOException ignored) {
                    // the failure that led here is more informative than a close error
                }
            }
        }
    }

    /**
     * Parses a directors list file and forwards every director with
     * {@link RelTypes#DIRECTOR} roles to the reader.
     * @param filename
     *            name of the directors list file
     * @return the summary string produced by {@link #parsePersonFile}
     * @throws IOException
     *             if unable to open or read the file
     */
    public Object parseDirectors(String filename) throws IOException {
        final BufferedReader fileReader = getFileReader(filename, "THE DIRECTORS LIST", 4);
        try {
            return parsePersonFile(fileReader, RelTypes.DIRECTOR);
        } finally {
            // this method opened the reader, so it is responsible for closing it
            fileReader.close();
        }
    }

    /**
     * Parses a composers list file and forwards every composer with
     * {@link RelTypes#COMPOSER} roles to the reader.
     * @param filename
     *            name of the composers list file
     * @return the summary string produced by {@link #parsePersonFile}
     * @throws IOException
     *             if unable to open or read the file
     */
    public Object parseComposers(String filename) throws IOException {
        final BufferedReader fileReader = getFileReader(filename, "THE COMPOSERS LIST", 4);
        try {
            return parsePersonFile(fileReader, RelTypes.COMPOSER);
        } finally {
            // this method opened the reader, so it is responsible for closing it
            fileReader.close();
        }
    }

    /**
     * Parses a producers list file and forwards every producer with
     * {@link RelTypes#PRODUCER} roles to the reader.
     * @param filename
     *            name of the producers list file
     * @return the summary string produced by {@link #parsePersonFile}
     * @throws IOException
     *             if unable to open or read the file
     */
    public Object parseProducers(String filename) throws IOException {
        final BufferedReader fileReader = getFileReader(filename, "THE PRODUCERS LIST", 4);
        try {
            return parsePersonFile(fileReader, RelTypes.PRODUCER);
        } finally {
            // this method opened the reader, so it is responsible for closing it
            fileReader.close();
        }
    }

    /**
     * Parses a writers list file and forwards every writer with
     * {@link RelTypes#WRITER} roles to the reader.
     * @param filename
     *            name of the writers list file
     * @return the summary string produced by {@link #parsePersonFile}
     * @throws IOException
     *             if unable to open or read the file
     */
    public Object parseWriters(String filename) throws IOException {
        final BufferedReader fileReader = getFileReader(filename, "THE WRITERS LIST", 4);
        try {
            return parsePersonFile(fileReader, RelTypes.WRITER);
        } finally {
            // this method opened the reader, so it is responsible for closing it
            fileReader.close();
        }
    }

    /**
     * Parses a genres list file and forwards one {@link GenreData} per
     * title/genre line to the reader. Parsing stops at the first empty line.
     * <p>
     * A line is only accepted when splitting it on exactly five consecutive
     * tabs yields two parts (title and genre). Titles that start with a
     * double quote are treated as series: the quotes are removed and
     * <code>" (SERIE)"</code> is appended. Episode entries (title containing
     * <code>{</code>) are skipped.
     * <p>
     * NOTE(review): if the list pads columns with a varying number of tabs,
     * the fixed five-tab separator silently drops those lines - confirm
     * against the data file.
     * @param filename
     *            name of the genres list file
     * @return a human readable summary with the number of genres parsed
     * @throws IOException
     *             if unable to open or read the file
     */
    public Object parseGenres(String filename) throws IOException {
        final BufferedReader fileReader = getFileReader(filename, "8: THE GENRES LIST", 2);
        try {
            final ProgressCounter genreCount = new ProgressCounter("genres");
            for (String line = fileReader.readLine(); line != null && !"".equals(line);
                    line = fileReader.readLine()) {
                final String[] tokens = line.split("\t\t\t\t\t");
                if (tokens.length != 2) {
                    continue;
                }
                String title = tokens[0].trim();
                final String genre = tokens[1].trim();
                if (title.startsWith("\"")) {
                    title = title.replace("\"", "") + " (SERIE)";
                }
                if (title.contains("{")) {
                    // episode entry, not imported
                    continue;
                }

                reader.newGenre(new GenreData(title, genre));
                genreCount.increment();
            }
            return (genreCount.getCount() + " genres parsed and injected.");
        } finally {
            // this method opened the reader, so it is responsible for closing it
            fileReader.close();
        }
    }

    /**
     * Parses a keywords list file and forwards the keywords of each title to
     * the reader under the attribute name <code>"keyword"</code>.
     * @param filename
     *            name of the keywords list file
     * @return the summary string produced by {@link #parseAtributeMultiple}
     * @throws IOException
     *             if unable to open or read the file
     */
    Object parseKeywords(String filename) throws IOException {
        final BufferedReader fileReader = getFileReader(filename, "8: THE KEYWORDS LIST", 2);
        try {
            return parseAtributeMultiple(fileReader, "keyword");
        } finally {
            // this method opened the reader, so it is responsible for closing it
            fileReader.close();
        }
    }

    /**
     * Reads title/value lines from an already positioned reader and forwards,
     * for every run of consecutive lines sharing one title, the collected
     * values to the {@link ImdbReader}. Reading stops at end of stream or at
     * the first empty line.
     * <p>
     * Only lines that split into exactly two parts on four consecutive tabs
     * are used. Titles starting with a double quote have their quotes removed
     * and <code>" (SERIE)"</code> appended; titles containing <code>{</code>
     * are ignored. The same list instance is passed to the reader on every
     * call and cleared afterwards.
     * @param fileReader
     *            reader positioned on the first data line; not closed here
     * @param attributeName
     *            attribute name passed to the reader and used in the summary
     * @return a human readable summary with the number of values parsed
     * @throws IOException
     *             if reading fails
     */
    Object parseAtributeMultiple(BufferedReader fileReader, String attributeName) throws IOException {
        String row = fileReader.readLine();
        final ProgressCounter valueCounter = new ProgressCounter(attributeName);
        final List<String> pendingValues = new LinkedList<String>();
        String groupTitle = null;

        for (; row != null && row.length() != 0; row = fileReader.readLine()) {
            final String[] parts = row.split("\t\t\t\t");
            if (parts.length == 2) {
                String rowTitle = parts[0].trim();
                final String rowValue = parts[1].trim();

                if (rowTitle.startsWith("\"")) {
                    rowTitle = rowTitle.replace("\"", "") + " (SERIE)";
                }

                final boolean ignored = rowTitle.contains("{") || rowTitle.startsWith("\"");
                if (!ignored) {
                    if (groupTitle == null) {
                        // very first accepted line opens the first group
                        groupTitle = rowTitle;
                    } else if (!groupTitle.equals(rowTitle)) {
                        // title changed: hand over the finished group
                        reader.newAtributeMultiple(groupTitle, attributeName, pendingValues);
                        pendingValues.clear();
                        groupTitle = rowTitle;
                    }

                    pendingValues.add(rowValue);
                    valueCounter.increment();
                }
            }
        }

        // hand over the group that was still open when reading stopped
        if (pendingValues.size() > 0) {
            reader.newAtributeMultiple(groupTitle, attributeName, pendingValues);
        }
        final String summary = valueCounter.getCount() + " " + attributeName + " parsed and injected.";
        return summary;
    }

    /**
     * Parses a cinematographers list file and forwards every person with
     * {@link RelTypes#CINEMATOGRAPHER} roles to the reader.
     * @param filename
     *            name of the cinematographers list file
     * @return the summary string produced by {@link #parsePersonFile}
     * @throws IOException
     *             if unable to open or read the file
     */
    Object parseCinematographer(String filename) throws IOException {
        final BufferedReader fileReader = getFileReader(filename, "THE CINEMATOGRAPHERS LIST", 4);
        try {
            return parsePersonFile(fileReader, RelTypes.CINEMATOGRAPHER);
        } finally {
            // this method opened the reader, so it is responsible for closing it
            fileReader.close();
        }
    }

    /**
     * Parses a countries list file and forwards the countries of each title
     * to the reader under the attribute name <code>"country"</code>.
     * @param filename
     *            name of the countries list file
     * @return the summary string produced by {@link #parseAtributeMultiple}
     * @throws IOException
     *             if unable to open or read the file
     */
    Object parseCountries(String filename) throws IOException {
        final BufferedReader fileReader = getFileReader(filename, "COUNTRIES LIST", 2);
        try {
            return parseAtributeMultiple(fileReader, "country");
        } finally {
            // this method opened the reader, so it is responsible for closing it
            fileReader.close();
        }
    }

    /**
     * Parses a languages list file and forwards the languages of each title
     * to the reader under the attribute name <code>"language"</code>.
     * @param filename
     *            name of the languages list file
     * @return the summary string produced by {@link #parseAtributeMultiple}
     * @throws IOException
     *             if unable to open or read the file
     */
    Object parseLanguages(String filename) throws IOException {
        final BufferedReader fileReader = getFileReader(filename, "LANGUAGE LIST", 2);
        try {
            return parseAtributeMultiple(fileReader, "language");
        } finally {
            // this method opened the reader, so it is responsible for closing it
            fileReader.close();
        }
    }
}