org.eclipse.rdf4j.rio.turtle.TurtleParser.java Source code

Java tutorial

Introduction

Here is the source code for org.eclipse.rdf4j.rio.turtle.TurtleParser.java

Source

/*******************************************************************************
 * Copyright (c) 2015 Eclipse RDF4J contributors, Aduna, and others.
 * All rights reserved. This program and the accompanying materials
 * are made available under the terms of the Eclipse Distribution License v1.0
 * which accompanies this distribution, and is available at
 * http://www.eclipse.org/org/documents/edl-v10.php.
 *******************************************************************************/
package org.eclipse.rdf4j.rio.turtle;

import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.PushbackReader;
import java.io.Reader;
import java.io.UnsupportedEncodingException;
import java.nio.charset.StandardCharsets;
import java.util.Arrays;
import java.util.Collection;
import java.util.HashSet;
import java.util.Set;

import org.apache.commons.io.input.BOMInputStream;
import org.eclipse.rdf4j.common.text.ASCIIUtil;
import org.eclipse.rdf4j.model.IRI;
import org.eclipse.rdf4j.model.Literal;
import org.eclipse.rdf4j.model.Resource;
import org.eclipse.rdf4j.model.Statement;
import org.eclipse.rdf4j.model.Value;
import org.eclipse.rdf4j.model.ValueFactory;
import org.eclipse.rdf4j.model.impl.SimpleValueFactory;
import org.eclipse.rdf4j.model.vocabulary.RDF;
import org.eclipse.rdf4j.model.vocabulary.XMLSchema;
import org.eclipse.rdf4j.rio.RDFFormat;
import org.eclipse.rdf4j.rio.RDFHandlerException;
import org.eclipse.rdf4j.rio.RDFParseException;
import org.eclipse.rdf4j.rio.RioSetting;
import org.eclipse.rdf4j.rio.helpers.AbstractRDFParser;
import org.eclipse.rdf4j.rio.helpers.BasicParserSettings;
import org.eclipse.rdf4j.rio.helpers.TurtleParserSettings;

/**
 * RDF parser for <a href="https://www.w3.org/TR/turtle/">RDF-1.1 Turtle</a>
 * files. This parser is not thread-safe, therefore its public methods are
 * synchronized.
 * <ul>
 * <li>Normalization of integer, floating point and boolean values is dependent
 * on the specified datatype handling. According to the specification, integers
 * and booleans should be normalized, but floats should not be.</li>
 * <li>Comments can be used anywhere in the document, and extend to the end of
 * the line. The Turtle grammar doesn't allow comments to be used inside triple
 * constructs that extend over multiple lines, but the author's own parser
 * deviates from this too.</li>
 * </ul>
 * 
 * @author Arjohn Kampman
 * @author Peter Ansell
 */
public class TurtleParser extends AbstractRDFParser {

    /*-----------*
     * Variables *
     *-----------*/

    // Reader over the document currently being parsed; assigned in
    // parse(Reader, String), where it wraps the caller's reader to allow
    // look-ahead via push-back.
    private PushbackReader reader;

    // Subject of the triple(s) currently being parsed; reset to null at the
    // end of parseTriples().
    protected Resource subject;

    // Predicate of the triple(s) currently being parsed; reset to null at the
    // end of parseTriples().
    protected IRI predicate;

    // Most recently parsed object; reset to null at the end of parseTriples().
    protected Value object;

    // Line counter, reset to 1 at the start of every parse.
    // NOTE(review): presumably advanced by the character-reading helpers,
    // which are not visible in this part of the file - confirm.
    private int lineNumber = 1;
    // NOTE(review): presumably the shared scratch buffer handed out by
    // getBuilder() - confirm against its definition.
    private final StringBuilder parsingBuilder = new StringBuilder();

    /*--------------*
     * Constructors *
     *--------------*/

    /**
     * Creates a new TurtleParser that will use a {@link SimpleValueFactory} to
     * create RDF model objects.
     * <p>
     * This constructor only delegates to the no-argument superclass
     * constructor; the choice of value factory is made there (not visible in
     * this file).
     */
    public TurtleParser() {
        super();
    }

    /**
     * Creates a new TurtleParser that will use the supplied ValueFactory to
     * create RDF model objects.
     * <p>
     * The factory is passed unchanged to the superclass constructor.
     *
     * @param valueFactory
     *            The ValueFactory used to create RDF model objects.
     */
    public TurtleParser(ValueFactory valueFactory) {
        super(valueFactory);
    }

    /*---------*
     * Methods *
     *---------*/

    /**
     * Returns the RDF format handled by this parser.
     *
     * @return {@link RDFFormat#TURTLE}.
     */
    @Override
    public RDFFormat getRDFFormat() {
        return RDFFormat.TURTLE;
    }

    /**
     * Lists the settings this parser understands: everything reported by the
     * superclass plus
     * {@link TurtleParserSettings#CASE_INSENSITIVE_DIRECTIVES}.
     *
     * @return a freshly created set containing the supported settings.
     */
    @Override
    public Collection<RioSetting<?>> getSupportedSettings() {
        final Collection<RioSetting<?>> inherited = super.getSupportedSettings();

        // Copy first so the Turtle-specific setting is added to our own set
        // rather than to whatever collection the superclass returned.
        final Set<RioSetting<?>> supported = new HashSet<>(inherited);
        supported.add(TurtleParserSettings.CASE_INSENSITIVE_DIRECTIVES);
        return supported;
    }

    /**
     * Implementation of the {@code parse(InputStream, String)} method defined
     * in the RDFParser interface.
     * <p>
     * The stream is wrapped in a {@link BOMInputStream} and decoded as UTF-8,
     * then handed to {@link #parse(Reader, String)}.
     *
     * @param in
     *            The InputStream from which to read the data, must not be
     *            {@code null}. The InputStream is supposed to contain UTF-8
     *            encoded Unicode characters, as per the Turtle specification.
     * @param baseURI
     *            The URI associated with the data in the InputStream, must not
     *            be {@code null}.
     * @throws IOException
     *             If an I/O error occurred while data was read from the
     *             InputStream.
     * @throws RDFParseException
     *             If the parser has found an unrecoverable parse error.
     * @throws RDFHandlerException
     *             If the configured statement handler encountered an
     *             unrecoverable error.
     * @throws IllegalArgumentException
     *             If the supplied input stream or base URI is {@code null}.
     */
    @Override
    public synchronized void parse(InputStream in, String baseURI)
            throws IOException, RDFParseException, RDFHandlerException {
        if (in == null) {
            throw new IllegalArgumentException("Input stream must not be 'null'");
        }
        // Note: baseURI will be checked in parse(Reader, String)

        // The Charset-based InputStreamReader constructor cannot throw
        // UnsupportedEncodingException, so no catch block is needed; any
        // IOException raised while parsing propagates to the caller as
        // declared instead of being wrapped in a RuntimeException.
        parse(new InputStreamReader(new BOMInputStream(in, false), StandardCharsets.UTF_8), baseURI);
    }

    /**
     * Implementation of the {@code parse(Reader, String)} method defined in
     * the RDFParser interface.
     * <p>
     * Parser state is cleared before and after the run. The configured
     * handler, if any, is notified via {@code startRDF()} before the first
     * statement and via {@code endRDF()} only when parsing completed without
     * an exception.
     *
     * @param reader
     *            The Reader from which to read the data, must not be
     *            {@code null}.
     * @param baseURI
     *            The URI associated with the data in the Reader, must not be
     *            {@code null}.
     * @throws IOException
     *             If an I/O error occurred while data was read from the
     *             Reader.
     * @throws RDFParseException
     *             If the parser has found an unrecoverable parse error.
     * @throws RDFHandlerException
     *             If the configured statement handler encountered an
     *             unrecoverable error.
     * @throws IllegalArgumentException
     *             If the supplied reader or base URI is {@code null}.
     */
    @Override
    public synchronized void parse(Reader reader, String baseURI)
            throws IOException, RDFParseException, RDFHandlerException {
        clear();

        try {
            if (reader == null) {
                throw new IllegalArgumentException("Reader must not be 'null'");
            }
            if (baseURI == null) {
                throw new IllegalArgumentException("base URI must not be 'null'");
            }

            if (rdfHandler != null) {
                rdfHandler.startRDF();
            }

            // Start counting lines at 1:
            lineNumber = 1;

            // parseStatement() sniffs the start of each statement and pushes
            // it back when it is not a directive. It stops once 8 UTF-16
            // chars are buffered, but the last code point appended may be a
            // surrogate pair, so up to 9 chars can be pushed back at once
            // (assuming unread(String) returns every char to the reader).
            // A 10-char buffer leaves headroom and avoids a "Pushback buffer
            // overflow" IOException on such input.
            this.reader = new PushbackReader(reader, 10);

            // Store normalized base URI
            setBaseURI(baseURI);

            reportLocation();

            int c = skipWSC();

            while (c != -1) {
                parseStatement();
                c = skipWSC();
            }
        } finally {
            clear();
        }

        if (rdfHandler != null) {
            rdfHandler.endRDF();
        }
    }

    /**
     * Parses one top-level statement: either a directive or a block of
     * triples.
     * <p>
     * The first few code points (up to whitespace, end of input, or 8 buffered
     * chars) are read to decide which of the two it is. A token starting with
     * {@code @}, or equal to {@code prefix} / {@code base} in any letter case,
     * is handled as a directive; anything else is pushed back and parsed as
     * triples. {@code @}-directives and triples must be terminated by
     * {@code .}; the SPARQL-style forms must not.
     */
    protected void parseStatement() throws IOException, RDFParseException, RDFHandlerException {
        final StringBuilder head = new StringBuilder(8);

        while (true) {
            final int cp = readCodePoint();
            if (cp == -1 || TurtleUtil.isWhitespace(cp)) {
                // Token ended early; leave the delimiter for the next reader
                unread(cp);
                break;
            }
            appendCodepoint(head, cp);
            if (head.length() >= 8) {
                break;
            }
        }

        final String directive = head.toString();
        final boolean atForm = directive.startsWith("@");
        final boolean sparqlForm = directive.equalsIgnoreCase("prefix") || directive.equalsIgnoreCase("base");

        if (!atForm && !sparqlForm) {
            // Not a directive: put everything back and parse it as triples
            unread(directive);
            parseTriples();
            skipWSC();
            verifyCharacterOrFail(readCodePoint(), ".");
            return;
        }

        parseDirective(directive);
        skipWSC();
        // SPARQL BASE and PREFIX lines do not end in .
        if (atForm) {
            verifyCharacterOrFail(readCodePoint(), ".");
        }
    }

    /**
     * Dispatches a directive token to {@link #parsePrefixID()} or
     * {@link #parseBase()}.
     * <p>
     * Keywords are tried in this order: exact {@code @prefix}, exact
     * {@code @base}, case-insensitive {@code prefix}, case-insensitive
     * {@code base}, then case-insensitive {@code @prefix} / {@code @base}. The
     * last two are reported as fatal errors unless
     * {@link TurtleParserSettings#CASE_INSENSITIVE_DIRECTIVES} is enabled. Any
     * characters of the token that follow the keyword are pushed back before
     * the directive body is parsed.
     *
     * @param directive
     *            the token read at the start of the statement.
     */
    protected void parseDirective(String directive) throws IOException, RDFParseException, RDFHandlerException {
        final int keywordLength;
        final boolean declaresPrefix;

        if (directive.startsWith("@prefix")) {
            keywordLength = 7;
            declaresPrefix = true;
        } else if (directive.startsWith("@base")) {
            keywordLength = 5;
            declaresPrefix = false;
        } else if (directive.regionMatches(true, 0, "prefix", 0, 6)) {
            // SPARQL doesn't require whitespace after the directive, so part
            // of the prefix ID may already be in the token
            keywordLength = 6;
            declaresPrefix = true;
        } else if (directive.regionMatches(true, 0, "base", 0, 4)) {
            keywordLength = 4;
            declaresPrefix = false;
        } else if (directive.regionMatches(true, 0, "@prefix", 0, 7)) {
            if (!this.getParserConfig().get(TurtleParserSettings.CASE_INSENSITIVE_DIRECTIVES)) {
                reportFatalError("Cannot strictly support case-insensitive @prefix directive in compliance mode.");
            }
            keywordLength = 7;
            declaresPrefix = true;
        } else if (directive.regionMatches(true, 0, "@base", 0, 5)) {
            if (!this.getParserConfig().get(TurtleParserSettings.CASE_INSENSITIVE_DIRECTIVES)) {
                reportFatalError("Cannot strictly support case-insensitive @base directive in compliance mode.");
            }
            keywordLength = 5;
            declaresPrefix = false;
        } else {
            if (directive.length() == 0) {
                reportFatalError("Directive name is missing, expected @prefix or @base");
            } else {
                reportFatalError("Unknown directive \"" + directive + "\"");
            }
            return;
        }

        // Hand back whatever was read beyond the keyword itself
        if (directive.length() > keywordLength) {
            unread(directive.substring(keywordLength));
        }

        if (declaresPrefix) {
            parsePrefixID();
        } else {
            parseBase();
        }
    }

    /**
     * Parses the body of a prefix declaration: a prefix label, a {@code :},
     * and the namespace IRI. The mapping is stored via {@code setNamespace}
     * and reported to the configured handler, if any.
     * <p>
     * If {@link #parseURI()} yields no IRI (it returns {@code null} for an
     * illegal IRI whose error was reported but not treated as fatal), the
     * declaration is skipped instead of failing with a
     * {@link NullPointerException}.
     */
    protected void parsePrefixID() throws IOException, RDFParseException, RDFHandlerException {
        skipWSC();

        // Read prefix ID (e.g. "rdf:" or ":")
        StringBuilder prefixID = new StringBuilder(8);

        while (true) {
            int c = readCodePoint();

            if (c == ':') {
                // Leave the colon for the explicit check below
                unread(c);
                break;
            } else if (TurtleUtil.isWhitespace(c)) {
                break;
            } else if (c == -1) {
                throwEOFException();
            }

            appendCodepoint(prefixID, c);
        }

        skipWSC();

        verifyCharacterOrFail(readCodePoint(), ":");

        skipWSC();

        // Read the namespace URI
        IRI namespace = parseURI();
        if (namespace == null) {
            // parseURI() has already reported why the IRI was rejected; there
            // is no valid namespace to register.
            return;
        }

        // Store and report this namespace mapping
        String prefixStr = prefixID.toString();
        String namespaceStr = namespace.toString();

        setNamespace(prefixStr, namespaceStr);

        if (rdfHandler != null) {
            rdfHandler.handleNamespace(prefixStr, namespaceStr);
        }
    }

    /**
     * Parses the body of a base declaration and makes the parsed IRI the new
     * base URI.
     * <p>
     * If {@link #parseURI()} yields no IRI (it returns {@code null} for an
     * illegal IRI whose error was reported but not treated as fatal), the
     * current base URI is left unchanged instead of failing with a
     * {@link NullPointerException}.
     */
    protected void parseBase() throws IOException, RDFParseException, RDFHandlerException {
        skipWSC();

        IRI baseURI = parseURI();
        if (baseURI == null) {
            // parseURI() has already reported why the IRI was rejected
            return;
        }

        setBaseURI(baseURI.toString());
    }

    /**
     * Parses a subject followed by its predicate-object list.
     * <p>
     * A statement that starts with {@code [} needs special care: an empty
     * {@code []} is an anonymous subject that must be followed by a
     * predicate-object list, whereas {@code [ ... ]} is a blank node property
     * list that may or may not be followed by further predicates and objects.
     * The {@code subject}, {@code predicate} and {@code object} fields are
     * reset to {@code null} on normal completion.
     */
    protected void parseTriples() throws IOException, RDFParseException, RDFHandlerException {
        if (peekCodePoint() != '[') {
            parseSubject();
            skipWSC();
            parsePredicateObjectList();
        } else {
            // Consume the bracket and look at what follows it
            readCodePoint();
            skipWSC();

            if (peekCodePoint() == ']') {
                // "[]": anonymous subject
                readCodePoint();
                subject = createNode();
                skipWSC();
                parsePredicateObjectList();
            } else {
                // "[ ... ]": restore the bracket and let the blank node
                // parser handle the nested predicate-object list
                unread('[');
                subject = parseImplicitBlank();
            }

            skipWSC();

            // Unless the statement ends here, the node parsed above is the
            // subject of a further predicate-object list
            if (peekCodePoint() != '.') {
                parsePredicateObjectList();
            }
        }

        subject = null;
        predicate = null;
        object = null;
    }

    /**
     * Parses one or more predicate/object-list pairs separated by {@code ;}.
     * <p>
     * Repeated semicolons are tolerated, and a trailing semicolon before
     * {@code .}, {@code ]} or {@code }} ends the list.
     */
    protected void parsePredicateObjectList() throws IOException, RDFParseException, RDFHandlerException {
        predicate = parsePredicate();
        skipWSC();
        parseObjectList();

        while (skipWSC() == ';') {
            // Consume the ';'
            readCodePoint();

            switch (skipWSC()) {
            case '.': // end of triple
            case ']': // end of predicateObjectList inside blank node
            case '}':
                return;
            case ';':
                // empty predicateObjectList, skip to next
                continue;
            default:
                predicate = parsePredicate();
                skipWSC();
                parseObjectList();
            }
        }
    }

    /**
     * Parses one or more objects separated by {@code ,}.
     */
    protected void parseObjectList() throws IOException, RDFParseException, RDFHandlerException {
        while (true) {
            parseObject();

            if (skipWSC() != ',') {
                return;
            }

            // Consume the ',' and move to the start of the next object
            readCodePoint();
            skipWSC();
        }
    }

    /**
     * Parses the subject of a statement and stores it in {@code subject}.
     * <p>
     * A collection or blank node property list is delegated to the matching
     * parser; any other value must be a {@link Resource}. A non-null value of
     * another kind is reported as a fatal error; a {@code null} value leaves
     * {@code subject} untouched.
     */
    protected void parseSubject() throws IOException, RDFParseException, RDFHandlerException {
        switch (peekCodePoint()) {
        case '(':
            subject = parseCollection();
            break;
        case '[':
            subject = parseImplicitBlank();
            break;
        default:
            final Value parsed = parseValue();
            if (parsed instanceof Resource) {
                subject = (Resource) parsed;
            } else if (parsed != null) {
                reportFatalError("Illegal subject value: " + parsed);
            }
        }
    }

    /**
     * Parses a predicate.
     * <p>
     * The keyword {@code a} followed by whitespace is shorthand for
     * {@code rdf:type}. Anything else is parsed as a regular value, which must
     * be an IRI.
     *
     * @return the predicate IRI, or {@code null} after a fatal error has been
     *         reported for a non-IRI value.
     */
    protected IRI parsePredicate() throws IOException, RDFParseException, RDFHandlerException {
        final int first = readCodePoint();

        if (first == 'a') {
            final int second = readCodePoint();

            if (TurtleUtil.isWhitespace(second)) {
                // The 'a' short-cut was used
                return RDF.TYPE;
            }

            // Not the short-cut after all: push back in reverse order
            unread(second);
        }
        unread(first);

        final Value parsed = parseValue();
        if (!(parsed instanceof IRI)) {
            reportFatalError("Illegal predicate value: " + parsed);
            return null;
        }
        return (IRI) parsed;
    }

    /**
     * Parses a single object and stores it in {@code object}.
     * <p>
     * For plain values the statement is reported here. This method does not
     * report one for collections and blank node property lists; those are
     * delegated to {@link #parseCollection()} and
     * {@link #parseImplicitBlank()}.
     */
    protected void parseObject() throws IOException, RDFParseException, RDFHandlerException {
        switch (peekCodePoint()) {
        case '(':
            object = parseCollection();
            break;
        case '[':
            object = parseImplicitBlank();
            break;
        default:
            object = parseValue();
            reportStatement(subject, predicate, object);
        }
    }

    /**
     * Parses a collection, e.g. {@code ( item1 item2 item3 )}, and emits the
     * {@code rdf:first} / {@code rdf:rest} statements that encode it.
     * <p>
     * When a subject is currently set, the statement linking it (via the
     * current predicate) to the list head is reported as well. The current
     * subject and predicate are restored before returning.
     *
     * @return the head node of the list, or {@code rdf:nil} for {@code ()}.
     */
    protected Resource parseCollection() throws IOException, RDFParseException, RDFHandlerException {
        verifyCharacterOrFail(readCodePoint(), "(");

        if (skipWSC() == ')') {
            // Empty list: consume ')' and use rdf:nil directly
            readCodePoint();
            if (subject != null) {
                reportStatement(subject, predicate, RDF.NIL);
            }
            return RDF.NIL;
        }

        final Resource head = createNode();
        if (subject != null) {
            reportStatement(subject, predicate, head);
        }

        // Saved so the enclosing statement can continue afterwards
        final Resource outerSubject = subject;
        final IRI outerPredicate = predicate;

        // Items are reported as (<list node> rdf:first <item>)
        subject = head;
        predicate = RDF.FIRST;
        parseObject();

        Resource tail = head;
        while (skipWSC() != ')') {
            // Chain a new list node onto the current tail
            final Resource next = createNode();
            reportStatement(tail, RDF.REST, next);

            tail = next;
            subject = next;
            parseObject();
        }

        // Consume ')' and terminate the list
        readCodePoint();
        reportStatement(tail, RDF.REST, RDF.NIL);

        subject = outerSubject;
        predicate = outerPredicate;

        return head;
    }

    /**
     * Parses an implicit blank node: either the token {@code []} or a
     * predicate-object list enclosed in square brackets.
     * <p>
     * When a subject is currently set, the statement linking it (via the
     * current predicate) to the new node is reported before the node's own
     * contents are parsed. The current subject and predicate are restored
     * before returning.
     *
     * @return the newly created blank node.
     */
    protected Resource parseImplicitBlank() throws IOException, RDFParseException, RDFHandlerException {
        verifyCharacterOrFail(readCodePoint(), "[");

        final Resource node = createNode();
        if (subject != null) {
            reportStatement(subject, predicate, node);
        }

        skipWSC();
        final int next = readCodePoint();
        if (next == ']') {
            // Plain "[]": nothing nested to parse
            return node;
        }
        unread(next);

        // Saved so the enclosing statement can continue afterwards
        final Resource outerSubject = subject;
        final IRI outerPredicate = predicate;

        // The new node is the subject of everything inside the brackets
        subject = node;

        skipWSC();
        parsePredicateObjectList();
        skipWSC();

        // Read closing bracket
        verifyCharacterOrFail(readCodePoint(), "]");

        subject = outerSubject;
        predicate = outerPredicate;

        return node;
    }

    /**
     * Parses an RDF value. The next code point decides which kind is parsed:
     * IRI reference, prefixed name or boolean, node ID, quoted literal, or
     * number.
     *
     * @return the parsed value, or {@code null} after end of input or an
     *         unexpected character has been reported.
     */
    protected Value parseValue() throws IOException, RDFParseException, RDFHandlerException {
        final int next = peekCodePoint();

        if (next == '<') {
            // uriref, e.g. <foo://bar>
            return parseURI();
        }
        if (next == ':' || TurtleUtil.isPrefixStartChar(next)) {
            // qname or boolean
            return parseQNameOrBoolean();
        }
        if (next == '_') {
            // node ID, e.g. _:n1
            return parseNodeID();
        }
        if (next == '"' || next == '\'') {
            // quoted literal, e.g. "foo" or """foo""" or 'foo' or '''foo'''
            return parseQuotedLiteral();
        }
        if (ASCIIUtil.isNumber(next) || next == '.' || next == '+' || next == '-') {
            // integer or double, e.g. 123 or 1.2e3
            return parseNumber();
        }
        if (next == -1) {
            throwEOFException();
            return null;
        }

        reportFatalError("Expected an RDF value here, found '" + new String(Character.toChars(next)) + "'");
        return null;
    }

    /**
     * Parses a quoted string, optionally followed by a language tag
     * ({@code @lang}) or a datatype ({@code ^^<iri>}).
     *
     * @return the literal, or {@code null} after a fatal error has been
     *         reported for a datatype that is not an IRI.
     */
    protected Literal parseQuotedLiteral() throws IOException, RDFParseException, RDFHandlerException {
        final String label = parseQuotedString();

        final int marker = peekCodePoint();

        if (marker == '^') {
            readCodePoint();

            // A datatype is introduced by two carets
            verifyCharacterOrFail(readCodePoint(), "^");

            skipWSC();

            final Value datatype = parseValue();
            if (datatype instanceof IRI) {
                return createLiteral(label, null, (IRI) datatype, getLineNumber(), -1);
            }
            reportFatalError("Illegal datatype value: " + datatype);
            return null;
        }

        if (marker != '@') {
            // Plain literal without language or datatype
            return createLiteral(label, null, null, getLineNumber(), -1);
        }

        // Language-tagged literal: consume '@' and collect the tag
        readCodePoint();

        final StringBuilder lang = getBuilder();

        int c = readCodePoint();
        if (c == -1) {
            throwEOFException();
        }

        final boolean verifyLanguageTag = getParserConfig().get(BasicParserSettings.VERIFY_LANGUAGE_TAGS);
        if (verifyLanguageTag && !TurtleUtil.isLanguageStartChar(c)) {
            reportError("Expected a letter, found '" + new String(Character.toChars(c)) + "'",
                    BasicParserSettings.VERIFY_LANGUAGE_TAGS);
        }

        appendCodepoint(lang, c);

        for (c = readCodePoint(); !TurtleUtil.isWhitespace(c); c = readCodePoint()) {
            // SES-1887: a language tag does not have to be followed by
            // whitespace, so statement punctuation and end of input also
            // terminate it
            if (c == '.' || c == ';' || c == ',' || c == ')' || c == ']' || c == -1) {
                break;
            }
            if (verifyLanguageTag && !TurtleUtil.isLanguageChar(c)) {
                reportError("Illegal language tag char: '" + new String(Character.toChars(c)) + "'",
                        BasicParserSettings.VERIFY_LANGUAGE_TAGS);
            }
            appendCodepoint(lang, c);
        }

        // The terminator is not part of the tag
        unread(c);

        return createLiteral(label, lang.toString(), null, getLineNumber(), -1);
    }

    /**
     * Parses a quoted string, which is either a "normal string" or a
     * """long string"""; single quotes are accepted in place of double quotes.
     *
     * @return the string contents with escape sequences decoded, or the raw
     *         contents if decoding failed and the error was not fatal.
     */
    protected String parseQuotedString() throws IOException, RDFParseException {
        final int open = readCodePoint();

        // First character should be '"' or "'"
        verifyCharacterOrFail(open, "\"\'");

        // Look two characters ahead to recognise a triple-quoted long string
        final int second = readCodePoint();
        final int third = readCodePoint();

        final boolean quoteChar = open == '"' || open == '\'';
        final boolean longString = quoteChar && second == open && third == open;

        final String raw;
        if (longString) {
            raw = parseLongString(second);
        } else {
            // Not a long string: hand the look-ahead back in reverse order
            unread(third);
            unread(second);

            raw = parseString(open);
        }

        // Unescape any escape sequences
        try {
            return TurtleUtil.decodeString(raw);
        } catch (IllegalArgumentException e) {
            reportError(e.getMessage(), BasicParserSettings.VERIFY_DATATYPE_VALUES);
            return raw;
        }
    }

    /**
     * Parses a "normal string". This method requires that the opening quote
     * has already been consumed.
     *
     * @param closingCharacter
     *            the code point that terminates the string.
     * @return the raw contents up to, but excluding, the closing character;
     *         escape sequences are kept as written.
     */
    protected String parseString(int closingCharacter) throws IOException, RDFParseException {
        final StringBuilder text = getBuilder();

        for (int c = readCodePoint(); c != closingCharacter; c = readCodePoint()) {
            if (c == -1) {
                throwEOFException();
            }

            appendCodepoint(text, c);

            if (c == '\\') {
                // A backslash escapes the next character, which may be the
                // closing quote itself, so copy it without inspecting it
                final int escaped = readCodePoint();
                if (escaped == -1) {
                    throwEOFException();
                }
                appendCodepoint(text, escaped);
            }
        }

        return text.toString();
    }

    /**
     * Parses a """long string""". This method requires that the three opening
     * quote characters have already been consumed.
     *
     * @param closingCharacter
     *            the quote code point; three in a row terminate the string.
     * @return the raw contents without the three closing quotes; escape
     *         sequences are kept as written.
     */
    protected String parseLongString(int closingCharacter) throws IOException, RDFParseException {
        final StringBuilder text = getBuilder();

        // Number of consecutive closing quote characters seen most recently
        int closingRun = 0;

        while (closingRun < 3) {
            final int c = readCodePoint();

            if (c == -1) {
                throwEOFException();
            } else if (c == closingCharacter) {
                closingRun++;
            } else {
                closingRun = 0;
            }

            appendCodepoint(text, c);

            if (c == '\\') {
                // A backslash escapes the next character, which may be a
                // quote; copy it without counting it towards the closing run
                final int escaped = readCodePoint();
                if (escaped == -1) {
                    throwEOFException();
                }
                appendCodepoint(text, escaped);
            }
        }

        // The three closing quotes were appended too; strip them
        return text.substring(0, text.length() - 3);
    }

    /**
     * Parses a numeric literal.
     * <p>
     * The datatype starts out as {@code xsd:integer}, becomes
     * {@code xsd:decimal} once a fractional part has been read, and
     * {@code xsd:double} once an exponent has been read. The lexical form is
     * passed to {@code createLiteral} exactly as it was read; no normalization
     * is applied in this method. The first character that does not belong to
     * the number is pushed back.
     *
     * @return the typed literal for the number that was read.
     */
    protected Literal parseNumber() throws IOException, RDFParseException {
        StringBuilder value = getBuilder();
        IRI datatype = XMLSchema.INTEGER;

        int c = readCodePoint();

        // read optional sign character
        if (c == '+' || c == '-') {
            appendCodepoint(value, c);
            c = readCodePoint();
        }

        // read the integer digits
        while (ASCIIUtil.isNumber(c)) {
            appendCodepoint(value, c);
            c = readCodePoint();
        }

        if (c == '.' || c == 'e' || c == 'E') {

            // read optional fractional digits
            if (c == '.') {

                if (TurtleUtil.isWhitespace(peekCodePoint())) {
                    // A '.' followed by whitespace is the statement
                    // terminator written directly after an integer. 'c' stays
                    // '.', so it is pushed back at the end of this method.
                } else {
                    appendCodepoint(value, c);

                    c = readCodePoint();

                    while (ASCIIUtil.isNumber(c)) {
                        appendCodepoint(value, c);
                        c = readCodePoint();
                    }

                    if (value.length() == 1) {
                        // We've only parsed a '.'
                        // NOTE(review): a sign followed by a lone '.' gives
                        // length 2 and is not caught here - confirm whether
                        // that is intended.
                        reportFatalError("Object for statement missing");
                    }

                    // We're parsing a decimal or a double
                    datatype = XMLSchema.DECIMAL;
                }
            } else {
                if (value.length() == 0) {
                    // We've only parsed an 'e' or 'E'
                    reportFatalError("Object for statement missing");
                }
            }

            // read optional exponent
            if (c == 'e' || c == 'E') {
                datatype = XMLSchema.DOUBLE;
                appendCodepoint(value, c);

                c = readCodePoint();
                if (c == '+' || c == '-') {
                    appendCodepoint(value, c);
                    c = readCodePoint();
                }

                if (!ASCIIUtil.isNumber(c)) {
                    reportError("Exponent value missing", BasicParserSettings.VERIFY_DATATYPE_VALUES);
                }

                // Appended even when it is not a digit and the error above
                // was not fatal.
                appendCodepoint(value, c);

                c = readCodePoint();
                while (ASCIIUtil.isNumber(c)) {
                    appendCodepoint(value, c);
                    c = readCodePoint();
                }
            }
        }

        // Unread last character, it isn't part of the number
        unread(c);

        // Return result as a typed literal
        return createLiteral(value.toString(), null, datatype, getLineNumber(), -1);
    }

    /**
     * Parses an IRI reference of the form {@code <...>} and resolves it
     * against the current base URI.
     * <p>
     * Unencoded spaces and backslash escapes other than {@code \}{@code u} /
     * {@code \}{@code U} are reported under
     * {@link BasicParserSettings#VERIFY_URI_SYNTAX}.
     *
     * @return the resolved IRI, or {@code null} if the IRI was found to be
     *         illegal, URI syntax verification is enabled, and the reported
     *         errors were not fatal.
     */
    protected IRI parseURI() throws IOException, RDFParseException {
        StringBuilder uriBuf = getBuilder();

        // First character should be '<'
        int c = readCodePoint();
        verifyCharacterOrFail(c, "<");

        boolean uriIsIllegal = false;
        // Read up to the next '>' character
        while (true) {
            c = readCodePoint();

            if (c == '>') {
                break;
            } else if (c == -1) {
                throwEOFException();
            }

            if (c == ' ') {
                // Spell out the character: concatenating the int code point
                // would print "32" instead of a space
                reportError("IRI included an unencoded space: ' '", BasicParserSettings.VERIFY_URI_SYNTAX);
                uriIsIllegal = true;
            }

            appendCodepoint(uriBuf, c);

            if (c == '\\') {
                // This escapes the next character, which might be a '>'
                c = readCodePoint();
                if (c == -1) {
                    throwEOFException();
                }
                if (c != 'u' && c != 'U') {
                    // Convert the code point to text so the message shows the
                    // offending character rather than its numeric value
                    reportError("IRI includes string escapes: '\\" + new String(Character.toChars(c)) + "'",
                            BasicParserSettings.VERIFY_URI_SYNTAX);
                    uriIsIllegal = true;
                }
                appendCodepoint(uriBuf, c);
            }
        }

        // NOTE(review): a check "IRI must not end in a '.'" used to follow
        // here, but it tested the loop variable, which is always '>' at this
        // point, so it could never fire. It has been dropped rather than
        // enabled because the Turtle IRIREF production permits a trailing
        // '.'.

        // do not report back the actual URI if it's illegal and the parser is
        // configured to verify URI syntax.
        if (uriIsIllegal && getParserConfig().get(BasicParserSettings.VERIFY_URI_SYNTAX)) {
            return null;
        }

        String uri = uriBuf.toString();

        // Unescape any escape sequences
        try {
            // FIXME: The following decodes \n and similar in URIs, which
            // should be invalid according to test
            // <turtle-syntax-bad-uri-04.ttl>
            uri = TurtleUtil.decodeString(uri);
        } catch (IllegalArgumentException e) {
            reportError(e.getMessage(), BasicParserSettings.VERIFY_DATATYPE_VALUES);
        }

        return super.resolveURI(uri);
    }

    /**
     * Parses a prefixed name (qname) or one of the boolean literals
     * <tt>true</tt> / <tt>false</tt>. Both start with equivalent characters,
     * so the decision is only taken once the leading name has been read and
     * turns out not to be followed by a ':'.
     *
     * @return a boolean literal when the input is <tt>true</tt> or
     *         <tt>false</tt> not followed by ':'; otherwise the value created
     *         from the namespace looked up for the prefix concatenated with
     *         the local name.
     * @throws IOException
     *             if reading from, or pushing back to, the reader fails.
     * @throws RDFParseException
     *             on unexpected end of input, when the prefix is not followed
     *             by ':', on an invalid local-name escape, or on an
     *             incomplete percent-encoded sequence in the local name.
     */
    protected Value parseQNameOrBoolean() throws IOException, RDFParseException {
        // First character should be a ':' or a letter
        int c = readCodePoint();
        if (c == -1) {
            throwEOFException();
        }
        if (c != ':' && !TurtleUtil.isPrefixStartChar(c)) {
            reportError("Expected a ':' or a letter, found '" + new String(Character.toChars(c)) + "'",
                    BasicParserSettings.VERIFY_RELATIVE_URIS);
        }

        String namespace = null;

        if (c == ':') {
            // qname using default namespace
            namespace = getNamespace("");
        } else {
            // c is the first letter of the prefix
            StringBuilder prefix = new StringBuilder(8);
            appendCodepoint(prefix, c);

            int previousChar = c;
            c = readCodePoint();
            while (TurtleUtil.isPrefixChar(c)) {
                appendCodepoint(prefix, c);
                previousChar = c;
                c = readCodePoint();
            }
            while (previousChar == '.' && prefix.length() > 0) {
                // '.' is a legal prefix name char, but can not appear at the
                // end: push the look-ahead back, make the '.' the current
                // character and drop it from the prefix.
                unread(c);
                c = previousChar;
                prefix.setLength(prefix.length() - 1);
                // Inspect the new last code point by char index. When the
                // prefix has been trimmed to nothing there is no previous
                // character left, so use -1 to end the loop instead of
                // indexing out of bounds.
                previousChar = prefix.length() > 0 ? prefix.codePointBefore(prefix.length()) : -1;
            }

            if (c != ':') {
                // prefix may actually be a boolean value
                String value = prefix.toString();

                if (value.equals("true") || value.equals("false")) {
                    unread(c);
                    return createLiteral(value, null, XMLSchema.BOOLEAN, getLineNumber(), -1);
                }
            }

            verifyCharacterOrFail(c, ":");

            namespace = getNamespace(prefix.toString());
        }

        // c == ':', read optional local name
        StringBuilder localName = new StringBuilder(16);
        c = readCodePoint();
        if (TurtleUtil.isNameStartChar(c)) {
            if (c == '\\') {
                localName.append(readLocalEscapedChar());
            } else {
                appendCodepoint(localName, c);
            }

            int previousChar = c;
            c = readCodePoint();
            while (TurtleUtil.isNameChar(c)) {
                if (c == '\\') {
                    localName.append(readLocalEscapedChar());
                } else {
                    appendCodepoint(localName, c);
                }
                previousChar = c;
                c = readCodePoint();
            }

            // Unread last character
            unread(c);

            if (previousChar == '.') {
                // '.' is a legal name char, but can not appear at the end, so
                // it is not actually part of the name. (For an escaped
                // character previousChar holds the backslash, so an escaped
                // '.' is left alone.)
                unread(previousChar);
                localName.deleteCharAt(localName.length() - 1);
            }
        } else {
            // Unread last character
            unread(c);
        }

        String localNameString = localName.toString();

        // Every '%' must be followed by two hexadecimal digits
        for (int i = 0; i < localNameString.length(); i++) {
            if (localNameString.charAt(i) == '%') {
                if (i > localNameString.length() - 3 || !ASCIIUtil.isHex(localNameString.charAt(i + 1))
                        || !ASCIIUtil.isHex(localNameString.charAt(i + 2))) {
                    reportFatalError("Found incomplete percent-encoded sequence: " + localNameString);
                }
            }
        }

        // Note: namespace has already been resolved
        return createURI(namespace + localNameString);
    }

    /**
     * Reads the character that follows a backslash inside a local name and
     * checks that it is one of the characters that may be escaped there.
     *
     * @return the escaped character (without the backslash).
     * @throws RDFParseException
     *             if the end of the input is reached, or if the character
     *             read is not accepted by
     *             <tt>TurtleUtil.isLocalEscapedChar</tt>.
     * @throws IOException
     *             if reading from the reader fails.
     */
    private char readLocalEscapedChar() throws RDFParseException, IOException {
        int c = readCodePoint();

        if (c == -1) {
            // Without this guard the error message below would call
            // Character.toChars(-1) and fail with an IllegalArgumentException
            // instead of a parse error.
            throwEOFException();
        }

        if (TurtleUtil.isLocalEscapedChar(c)) {
            return (char) c;
        }

        throw new RDFParseException("found '" + new String(Character.toChars(c)) + "', expected one of: "
                + Arrays.toString(TurtleUtil.LOCAL_ESCAPED_CHARS));
    }

    /**
     * Parses a blank node ID, e.g. <tt>_:node1</tt>.
     *
     * @return the node created from the label that follows the <tt>_:</tt>
     *         marker.
     * @throws IOException
     *             if reading from, or pushing back to, the reader fails.
     * @throws RDFParseException
     *             if the input does not start with <tt>_:</tt>, ends
     *             prematurely, or the label starts with a character that is
     *             not allowed there.
     */
    protected Resource parseNodeID() throws IOException, RDFParseException {
        // Node ID should start with "_:"
        verifyCharacterOrFail(readCodePoint(), "_");
        verifyCharacterOrFail(readCodePoint(), ":");

        // Read the node ID
        int c = readCodePoint();
        if (c == -1) {
            throwEOFException();
        } else if (!TurtleUtil.isBLANK_NODE_LABEL_StartChar(c)) {
            // Render the complete code point in the message; a plain (char)
            // cast would mangle characters outside the BMP.
            reportError("Expected a letter, found '" + new String(Character.toChars(c)) + "'",
                    BasicParserSettings.PRESERVE_BNODE_IDS);
        }

        StringBuilder name = getBuilder();
        appendCodepoint(name, c);

        // Read all following letter and numbers, they are part of the name
        c = readCodePoint();

        // If we would never go into the loop we must unread now
        if (!TurtleUtil.isBLANK_NODE_LABEL_Char(c)) {
            unread(c);
        }

        while (TurtleUtil.isBLANK_NODE_LABEL_Char(c)) {
            int previous = c;
            c = readCodePoint();

            // A '.' followed by end of input, whitespace, '<' or '_' is not
            // taken as part of the label: push both characters back and stop.
            if (previous == '.' && (c == -1 || TurtleUtil.isWhitespace(c) || c == '<' || c == '_')) {
                unread(c);
                unread(previous);
                break;
            }
            appendCodepoint(name, previous);
            // The look-ahead character ends the label; leave it for the
            // caller.
            if (!TurtleUtil.isBLANK_NODE_LABEL_Char(c)) {
                unread(c);
            }
        }

        return createNode(name.toString());
    }

    /**
     * Creates a statement from the supplied subject, predicate and object and
     * hands it to the {@link #rdfHandler}, if one is set. Nothing happens when
     * any of the three components is <tt>null</tt>.
     *
     * @param subj
     *            the statement's subject, may be <tt>null</tt>.
     * @param pred
     *            the statement's predicate, may be <tt>null</tt>.
     * @param obj
     *            the statement's object, may be <tt>null</tt>.
     */
    protected void reportStatement(Resource subj, IRI pred, Value obj)
            throws RDFParseException, RDFHandlerException {
        if (subj == null || pred == null || obj == null) {
            // incomplete triple, nothing to report
            return;
        }

        final Statement statement = createStatement(subj, pred, obj);
        if (rdfHandler == null) {
            return;
        }
        rdfHandler.handleStatement(statement);
    }

    /**
     * Checks that the code point <tt>codePoint</tt> occurs in the string of
     * acceptable characters <tt>expected</tt>. If it does not, a fatal error
     * listing the acceptable characters and the one actually found is
     * reported; end of input (<tt>-1</tt>) is reported as an EOF error.
     *
     * @param codePoint
     *            the code point that was read, or -1 for end of input.
     * @param expected
     *            the characters that are acceptable at this position.
     */
    protected void verifyCharacterOrFail(int codePoint, String expected) throws RDFParseException {
        if (codePoint == -1) {
            throwEOFException();
        }

        final String actual = new String(Character.toChars(codePoint));
        if (expected.contains(actual)) {
            return;
        }

        final StringBuilder message = new StringBuilder(32).append("Expected ");
        String separator = "";
        for (char candidate : expected.toCharArray()) {
            message.append(separator).append('\'').append(candidate).append('\'');
            separator = " or ";
        }
        message.append(", found '").append(actual).append("'");

        reportFatalError(message.toString());
    }

    /**
     * Skips over white space (as recognised by
     * <tt>TurtleUtil.isWhitespace</tt>) and #-style comments. Comments are
     * handed to {@link #processComment()}; every line feed that is skipped
     * increments the line counter and triggers a location report. The first
     * character that is neither white space nor the start of a comment is
     * pushed back, so it is the next one the reader returns, and is also
     * returned from this method for convenience.
     *
     * @return The next character code point that will be returned by
     *         <tt>reader</tt>, or -1 if the end of the input was reached.
     */
    protected int skipWSC() throws IOException, RDFHandlerException {
        int next;
        for (next = readCodePoint(); TurtleUtil.isWhitespace(next) || next == '#'; next = readCodePoint()) {
            if (next == '#') {
                processComment();
                continue;
            }
            if (next == '\n') {
                // only line feeds (LF) are counted, not carriage returns
                // (CR), as a CR is normally followed directly by a LF
                lineNumber++;
                reportLocation();
            }
        }

        unread(next);
        return next;
    }

    /**
     * Reads the remainder of the current line (up to a CR, LF, CR LF or the
     * end of the input) and passes that text to the {@link #rdfHandler}, if
     * one is set, as a comment. The line counter is advanced for the line
     * ending that was consumed and the new location is reported.
     */
    protected void processComment() throws IOException, RDFHandlerException {
        final StringBuilder text = getBuilder();

        int c;
        for (c = readCodePoint(); c != -1 && c != '\r' && c != '\n'; c = readCodePoint()) {
            appendCodepoint(text, c);
        }

        // c is now -1, CR or LF
        if (c == '\n') {
            lineNumber++;
        } else if (c == '\r') {
            // a CR may be followed by a LF that belongs to the same line
            // ending; anything else is pushed back
            c = readCodePoint();
            lineNumber++;

            if (c != '\n') {
                unread(c);
            }
        }

        if (rdfHandler != null) {
            rdfHandler.handleComment(text.toString());
        }
        reportLocation();
    }

    /**
     * Reads the next Unicode code point. A high surrogate that is directly
     * followed by a low surrogate is combined with it into a single
     * supplementary code point. A high surrogate that is <em>not</em> followed
     * by a low surrogate (including one at the very end of the input) is
     * returned on its own, and the character after it is left in the reader.
     *
     * @return the next Unicode code point, or -1 if the end of the stream has
     *         been reached.
     * @throws IOException
     *             if reading from, or pushing back to, the reader fails.
     */
    protected int readCodePoint() throws IOException {
        final int first = reader.read();
        if (first == -1 || !Character.isHighSurrogate((char) first)) {
            return first;
        }

        final int second = reader.read();
        if (second != -1 && Character.isLowSurrogate((char) second)) {
            return Character.toCodePoint((char) first, (char) second);
        }

        // Unpaired high surrogate: combining it with EOF or an arbitrary
        // character would yield a bogus code point and swallow that
        // character, so put the look-ahead back and return the surrogate
        // as-is.
        if (second != -1) {
            reader.unread(second);
        }
        return first;
    }

    /**
     * Returns a single code point to the front of the reader's pushback
     * buffer, so that the next call to {@link #readCodePoint()} yields it
     * again. The end-of-input marker (-1) is ignored.
     *
     * @param codePoint
     *            a single Unicode code point, or -1.
     * @throws IOException
     *             if the reader rejects the pushed-back characters.
     */
    protected void unread(int codePoint) throws IOException {
        if (codePoint == -1) {
            // nothing to push back at end of input
            return;
        }

        if (!Character.isSupplementaryCodePoint(codePoint)) {
            reader.unread(codePoint);
            return;
        }

        // supplementary code points go back as a surrogate pair
        reader.unread(Character.toChars(codePoint));
    }

    /**
     * Pushes back the supplied string by copying it to the front of the buffer.
     * After this method returns, successive calls to {@link #readCodePoint()}
     * will return the code points in the supplied string again, starting at
     * the first in the string.
     *
     * @param string
     *            the string to un-read.
     * @throws IOException
     *             if the reader rejects the pushed-back characters.
     */
    protected void unread(String string) throws IOException {
        // Walk backwards by char index: codePointBefore expects a char index,
        // not a code point count, so step by the number of chars each code
        // point occupies. This keeps surrogate pairs intact.
        int end = string.length();
        while (end > 0) {
            final int codePoint = string.codePointBefore(end);
            end -= Character.charCount(codePoint);

            if (Character.isSupplementaryCodePoint(codePoint)) {
                final char[] surrogatePair = Character.toChars(codePoint);
                reader.unread(surrogatePair);
            } else {
                reader.unread(codePoint);
            }
        }
    }

    /**
     * Looks at the upcoming Unicode code point without consuming it: the code
     * point is read and immediately pushed back.
     *
     * @return the next Unicode code point, or -1 if the end of the stream has
     *         been reached.
     * @throws IOException
     *             if reading from, or pushing back to, the reader fails.
     */
    protected int peekCodePoint() throws IOException {
        final int upcoming = readCodePoint();
        unread(upcoming);
        return upcoming;
    }

    /**
     * Reports the parser's current location, identified by the current line
     * number; -1 is passed as the second position argument (presumably
     * meaning "column unknown" -- confirm against the superclass).
     */
    protected void reportLocation() {
        final int currentLine = getLineNumber();
        reportLocation(currentLine, -1);
    }

    /**
     * Overrides {@link AbstractRDFParser#reportWarning(String)} so that the
     * current line number is attached to the warning.
     */
    @Override
    protected void reportWarning(String msg) {
        final int currentLine = getLineNumber();
        reportWarning(msg, currentLine, -1);
    }

    /**
     * Overrides {@link AbstractRDFParser#reportError(String, RioSetting)} so
     * that the current line number is attached to the error.
     */
    @Override
    protected void reportError(String msg, RioSetting<Boolean> setting) throws RDFParseException {
        final int currentLine = getLineNumber();
        reportError(msg, currentLine, -1, setting);
    }

    /**
     * Overrides {@link AbstractRDFParser#reportFatalError(String)} so that
     * the current line number is attached to the fatal error.
     */
    @Override
    protected void reportFatalError(String msg) throws RDFParseException {
        final int currentLine = getLineNumber();
        reportFatalError(msg, currentLine, -1);
    }

    /**
     * Overrides {@link AbstractRDFParser#reportFatalError(Exception)} so that
     * the current line number is attached to the fatal error.
     */
    @Override
    protected void reportFatalError(Exception e) throws RDFParseException {
        final int currentLine = getLineNumber();
        reportFatalError(e, currentLine, -1);
    }

    /**
     * Signals that the input ended where more characters were required.
     *
     * @throws RDFParseException
     *             always.
     */
    protected void throwEOFException() throws RDFParseException {
        final String message = "Unexpected end of file";
        throw new RDFParseException(message);
    }

    /**
     * @return the current value of the parser's line counter.
     */
    protected int getLineNumber() {
        return this.lineNumber;
    }

    /**
     * Hands out the parser's shared string builder after emptying it. Since
     * the same instance is returned on every call, a previously obtained
     * builder must not be in use any more when this is called again.
     *
     * @return the shared, now empty, string builder.
     */
    private StringBuilder getBuilder() {
        final StringBuilder shared = parsingBuilder;
        shared.setLength(0);
        return shared;
    }

    /**
     * Appends the character(s) that make up a code point to a string builder:
     * one char for a BMP code point, a surrogate pair for a supplementary
     * one. Equivalent to appending the result of Character#toChars, but
     * without allocating a temporary char array.
     * 
     * @param dst
     *            the destination in which to append the characters
     * @param codePoint
     *            the codepoint to be appended
     * @throws IllegalArgumentException
     *             if <tt>codePoint</tt> is not a valid Unicode code point.
     */
    private static void appendCodepoint(StringBuilder dst, int codePoint) {
        if (!Character.isValidCodePoint(codePoint)) {
            throw new IllegalArgumentException("Invalid codepoint " + codePoint);
        }

        if (Character.isBmpCodePoint(codePoint)) {
            dst.append((char) codePoint);
            return;
        }

        // supplementary plane: needs a surrogate pair
        dst.append(Character.highSurrogate(codePoint)).append(Character.lowSurrogate(codePoint));
    }
}