Example usage for java.lang CharSequence subSequence

List of usage examples for java.lang CharSequence subSequence

Introduction

On this page you can find example usages of java.lang.CharSequence.subSequence.

Prototype

CharSequence subSequence(int start, int end);

Source Link

Document

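Returns a CharSequence that is a subsequence of this sequence. The subsequence starts with the char value at index start and ends with the char value at index end - 1, so its length is end - start.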

Usage

From source file:org.archive.extractor.RegexHTMLLinkExtractor.java

/**
 * Walks every attribute that {@code EACH_ATTRIBUTE_EXTRACTOR} finds in
 * {@code cs} and hands its value to the handler selected by the capture
 * group that took part in the match (HREF, ACTION, ON*, SRC, CODEBASE,
 * CLASSID/DATA, ARCHIVE, CODE, VALUE).
 *
 * <p>CLASSID/DATA, ARCHIVE and CODE values are collected first and only
 * passed to {@code processEmbed} after the scan, resolved against the
 * CODEBASE value when one was seen.
 *
 * <p>NOTE(review): the result is {@code (tally - next.size()) > 0}, where
 * {@code tally} is the size of {@code next} on entry. That is positive only
 * if {@code next} shrank during the call; confirm whether
 * {@code next.size() - tally} was intended.
 *
 * @param element the tag name the attributes belong to
 * @param cs the text to scan for attributes
 * @return the comparison described above
 */
protected boolean processGeneralTag(CharSequence element, CharSequence cs) {

    final Matcher attrMatcher = TextUtils.getMatcher(EACH_ATTRIBUTE_EXTRACTOR, cs);

    // Only populated when OBJECT/APPLET-style attributes are encountered.
    String codebase = null;
    ArrayList<String> resources = null;
    final long tally = next.size();

    while (attrMatcher.find()) {
        // The attribute value lives in group 12 or 13 when either matched,
        // otherwise group 14 is used.
        final int valueGroup;
        if (attrMatcher.start(12) > -1) {
            valueGroup = 12;
        } else if (attrMatcher.start(13) > -1) {
            valueGroup = 13;
        } else {
            valueGroup = 14;
        }
        final CharSequence value = cs.subSequence(attrMatcher.start(valueGroup), attrMatcher.end(valueGroup));

        if (attrMatcher.start(2) > -1) {
            // HREF
            final LinkContext hrefContext = new HTMLLinkContext(element, attrMatcher.group(2));
            final String tagName = element.toString();
            if (tagName.equalsIgnoreCase(LINK)) {
                // <LINK> elements are treated as embeds (css, ico, etc)
                processEmbed(value, hrefContext);
            } else {
                if (tagName.equalsIgnoreCase(BASE)) {
                    try {
                        base = UURIFactory.getInstance(value.toString());
                    } catch (URIException e) {
                        extractErrorListener.noteExtractError(e, source, value);
                    }
                }
                // every other HREF (including BASE) is treated as a link
                processLink(value, hrefContext);
            }
            continue;
        }

        if (attrMatcher.start(3) > -1) {
            // ACTION
            final LinkContext actionContext = new HTMLLinkContext(element, attrMatcher.group(3));
            processLink(value, actionContext);
            continue;
        }

        if (attrMatcher.start(4) > -1) {
            // ON____
            processScriptCode(value); // TODO: context?
            continue;
        }

        if (attrMatcher.start(5) > -1) {
            // SRC etc.
            final LinkContext srcContext = new HTMLLinkContext(element, attrMatcher.group(5));
            processEmbed(value, srcContext);
            continue;
        }

        if (attrMatcher.start(6) > -1) {
            // CODEBASE
            // TODO: more HTML deescaping?
            codebase = TextUtils.replaceAll(ESCAPED_AMP, value, AMP);
            final LinkContext codebaseContext = new HTMLLinkContext(element, attrMatcher.group(6));
            processEmbed(codebase, codebaseContext);
            continue;
        }

        if (attrMatcher.start(7) > -1) {
            // CLASSID, DATA
            if (resources == null) {
                resources = new ArrayList<String>();
            }
            resources.add(value.toString());
            continue;
        }

        if (attrMatcher.start(8) > -1) {
            // ARCHIVE: a whitespace-separated list of resources
            if (resources == null) {
                resources = new ArrayList<String>();
            }
            for (String archive : TextUtils.split(WHITESPACE, value)) {
                resources.add(archive);
            }
            continue;
        }

        if (attrMatcher.start(9) > -1) {
            // CODE
            if (resources == null) {
                resources = new ArrayList<String>();
            }
            // For an applet whose code value lacks the class extension,
            // the extension is appended before the value is recorded.
            final String code = value.toString();
            if (element.toString().toLowerCase().equals(APPLET) && !code.toLowerCase().endsWith(CLASSEXT)) {
                resources.add(code + CLASSEXT);
            } else {
                resources.add(code);
            }
            continue;
        }

        if (attrMatcher.start(10) > -1) {
            // VALUE: only followed when it looks like a URI path
            if (TextUtils.matches(LIKELY_URI_PATH, value)) {
                final LinkContext valueContext = new HTMLLinkContext(element, attrMatcher.group(10));
                processLink(value, valueContext);
            }
            continue;
        }

        // Any other attribute (group 11) is ignored for now. Path- or
        // script-looking strings should be vanishingly rare there and/or
        // symptomatic of page bugs.
    }
    TextUtils.recycleMatcher(attrMatcher);

    // handle codebase/resources
    if (resources == null) {
        return (tally - next.size()) > 0;
    }

    UURI codebaseURI = null;
    String res = null; // declared out here so the warning below can report it
    try {
        if (codebase != null) {
            // TODO: Pass in the charset.
            codebaseURI = UURIFactory.getInstance(base, codebase);
        }
        for (String resource : resources) {
            res = resource;
            // TODO: more HTML deescaping?
            res = TextUtils.replaceAll(ESCAPED_AMP, res, AMP);
            if (codebaseURI != null) {
                res = codebaseURI.resolve(res).toString();
            }
            processEmbed(res, new HTMLLinkContext(element.toString())); // TODO: include attribute too
        }
    } catch (URIException e) {
        extractErrorListener.noteExtractError(e, source, codebase);
    } catch (IllegalArgumentException e) {
        DevUtils.logger.log(Level.WARNING,
                "processGeneralTag()\n" + "codebase=" + codebase + " res=" + res + "\n" + DevUtils.extraInfo(),
                e);
    }
    return (tally - next.size()) > 0;
}

From source file:com.cyberway.issue.extractor.RegexpHTMLLinkExtractor.java

/**
 * Scans {@code cs} with {@code EACH_ATTRIBUTE_EXTRACTOR} and dispatches each
 * attribute value according to the capture group that matched: HREF and
 * ACTION values go to {@code processLink}, SRC and CODEBASE values to
 * {@code processEmbed}, ON* values to {@code processScriptCode}, and
 * CLASSID/DATA, ARCHIVE and CODE values are collected as resources.
 *
 * <p>After the scan the collected resources are resolved against the
 * CODEBASE value (when one was seen) and passed to {@code processEmbed}.
 *
 * <p>NOTE(review): the result is {@code (tally - next.size()) > 0}, where
 * {@code tally} is the size of {@code next} on entry. That is positive only
 * if {@code next} shrank during the call; confirm whether
 * {@code next.size() - tally} was intended.
 *
 * @param element the tag name the attributes belong to
 * @param cs the text to scan for attributes
 * @return the comparison described above
 */
protected boolean processGeneralTag(CharSequence element, CharSequence cs) {

    Matcher attr = TextUtils.getMatcher(EACH_ATTRIBUTE_EXTRACTOR, cs);

    // Just in case it's an OBJECT or APPLET tag
    String codebase = null;
    ArrayList<String> resources = null;
    long tally = next.size();

    while (attr.find()) {
        // The value is in group 12 or 13 when either matched, else group 14.
        int valueGroup = (attr.start(12) > -1) ? 12 : (attr.start(13) > -1) ? 13 : 14;
        int start = attr.start(valueGroup);
        int end = attr.end(valueGroup);
        CharSequence value = cs.subSequence(start, end);
        if (attr.start(2) > -1) {
            // HREF
            CharSequence context = Link.elementContext(element, attr.group(2));
            if (element.toString().equalsIgnoreCase(LINK)) {
                // <LINK> elements treated as embeds (css, ico, etc)
                processEmbed(value, context);
            } else {
                if (element.toString().equalsIgnoreCase(BASE)) {
                    try {
                        base = UURIFactory.getInstance(value.toString());
                    } catch (URIException e) {
                        extractErrorListener.noteExtractError(e, source, value);
                    }
                }
                // other HREFs treated as links
                processLink(value, context);
            }
        } else if (attr.start(3) > -1) {
            // ACTION
            CharSequence context = Link.elementContext(element, attr.group(3));
            processLink(value, context);
        } else if (attr.start(4) > -1) {
            // ON____
            processScriptCode(value); // TODO: context?
        } else if (attr.start(5) > -1) {
            // SRC etc.
            CharSequence context = Link.elementContext(element, attr.group(5));
            processEmbed(value, context);
        } else if (attr.start(6) > -1) {
            // CODEBASE
            // TODO: more HTML deescaping?
            codebase = TextUtils.replaceAll(ESCAPED_AMP, value, AMP);
            CharSequence context = Link.elementContext(element, attr.group(6));
            processEmbed(codebase, context);
        } else if (attr.start(7) > -1) {
            // CLASSID, DATA
            if (resources == null) {
                resources = new ArrayList<String>();
            }
            resources.add(value.toString());
        } else if (attr.start(8) > -1) {
            // ARCHIVE: a whitespace-separated list of resources
            if (resources == null) {
                resources = new ArrayList<String>();
            }
            for (String archive : TextUtils.split(WHITESPACE, value)) {
                resources.add(archive);
            }
        } else if (attr.start(9) > -1) {
            // CODE
            if (resources == null) {
                resources = new ArrayList<String>();
            }
            // If element is applet and code value does not end with
            // '.class' then append '.class' to the code value.
            if (element.toString().toLowerCase().equals(APPLET)
                    && !value.toString().toLowerCase().endsWith(CLASSEXT)) {
                resources.add(value.toString() + CLASSEXT);
            } else {
                resources.add(value.toString());
            }

        } else if (attr.start(10) > -1) {
            // VALUE
            if (TextUtils.matches(LIKELY_URI_PATH, value)) {
                CharSequence context = Link.elementContext(element, attr.group(10));
                processLink(value, context);
            }

        } else if (attr.start(11) > -1) {
            // any other attribute
            // ignore for now
            // could probe for path- or script-looking strings, but
            // those should be vanishingly rare in other attributes,
            // and/or symptomatic of page bugs
        }
    }
    TextUtils.recycleMatcher(attr);

    // handle codebase/resources
    if (resources == null) {
        return (tally - next.size()) > 0;
    }
    UURI codebaseURI = null;
    String res = null; // kept outside the try so the warning below can report it
    try {
        if (codebase != null) {
            // TODO: Pass in the charset.
            codebaseURI = UURIFactory.getInstance(base, codebase);
        }
        // Typed iteration: the list only ever holds Strings, so neither a raw
        // Iterator nor a toString() call on each element is needed.
        for (String resource : resources) {
            res = resource;
            // TODO: more HTML deescaping?
            res = TextUtils.replaceAll(ESCAPED_AMP, res, AMP);
            if (codebaseURI != null) {
                res = codebaseURI.resolve(res).toString();
            }
            processEmbed(res, element); // TODO: include attribute too
        }
    } catch (URIException e) {
        extractErrorListener.noteExtractError(e, source, codebase);
    } catch (IllegalArgumentException e) {
        DevUtils.logger.log(Level.WARNING,
                "processGeneralTag()\n" + "codebase=" + codebase + " res=" + res + "\n" + DevUtils.extraInfo(),
                e);
    }
    return (tally - next.size()) > 0;
}

From source file:org.archive.crawler.extractor.ExtractorHTML.java

/**
 * Handles the value of a generic HREF-style attribute.
 *
 * <p>A value matching {@code JAVASCRIPT} is passed to
 * {@code processScriptCode} without its first 11 characters. Any other
 * value is logged at FINEST (when enabled), added via
 * {@code addLinkFromString} with {@code Link.NAVLINK_HOP}, and counted in
 * {@code numberOfLinksExtracted}.
 *
 * @param curi the URI whose content is being processed
 * @param value the attribute value to handle
 * @param context the context handed on to {@code addLinkFromString}
 */
protected void processLink(CrawlURI curi, final CharSequence value, CharSequence context) {
    if (TextUtils.matches(JAVASCRIPT, value)) {
        // Drop the first 11 characters (presumably the "javascript:" scheme
        // prefix; confirm against the JAVASCRIPT pattern).
        final CharSequence script = value.subSequence(11, value.length());
        processScriptCode(curi, script);
        return;
    }

    if (logger.isLoggable(Level.FINEST)) {
        logger.finest("link: " + value.toString() + " from " + curi);
    }
    addLinkFromString(curi, value, context, Link.NAVLINK_HOP);
    this.numberOfLinksExtracted++;
}

From source file:com.cyberway.issue.crawler.extractor.ExtractorHTML.java

/**
 * Processes the attributes of a META tag.
 *
 * <p>The {@code name}, {@code http-equiv} and {@code content} attribute
 * values are collected. A {@code robots} tag stores its content on the
 * CrawlURI under {@code A_META_ROBOTS}; a {@code refresh} tag adds the text
 * after the first {@code '='} of its content as a link relative to the base.
 *
 * @param curi CrawlURI we're processing.
 * @param cs Sequence from underlying ReplayCharSequence. This
 * is TRANSIENT data. Make a copy if you want the data to live outside
 * of this extractor's lifetime.
 * @return {@code true} only for a robots meta-tag containing
 * {@code nofollow} or {@code none} while the honoring policy is absent or
 * is neither IGNORE nor CUSTOM; {@code false} in every other case.
 */
protected boolean processMeta(CrawlURI curi, CharSequence cs) {
    String name = null;
    String httpEquiv = null;
    String content = null;

    final Matcher attr = TextUtils.getMatcher(EACH_ATTRIBUTE_EXTRACTOR, cs);
    while (attr.find()) {
        // The value is in group 14 or 15 when either matched, else group 16.
        final int valueGroup;
        if (attr.start(14) > -1) {
            valueGroup = 14;
        } else if (attr.start(15) > -1) {
            valueGroup = 15;
        } else {
            valueGroup = 16;
        }
        final CharSequence value = cs.subSequence(attr.start(valueGroup), attr.end(valueGroup));

        final String attrName = attr.group(1);
        if (attrName.equalsIgnoreCase("name")) {
            name = value.toString();
        } else if (attrName.equalsIgnoreCase("http-equiv")) {
            httpEquiv = value.toString();
        } else if (attrName.equalsIgnoreCase("content")) {
            content = value.toString();
        }
        // TODO: handle other stuff
    }
    TextUtils.recycleMatcher(attr);

    if ("robots".equalsIgnoreCase(name) && content != null) {
        // Record the 'robots' meta-tag content on the URI.
        curi.putString(A_META_ROBOTS, content);
        final RobotsHonoringPolicy policy = getSettingsHandler().getOrder().getRobotsHonoringPolicy();
        final String lowered = content.toLowerCase();
        final boolean metaIsHonored = policy == null
                || !(policy.isType(curi, RobotsHonoringPolicy.IGNORE)
                        || policy.isType(curi, RobotsHonoringPolicy.CUSTOM));
        if (metaIsHonored && (lowered.contains("nofollow") || lowered.contains("none"))) {
            // 'nofollow' or 'none' under a policy that is not IGNORE or
            // CUSTOM ends html extraction.
            logger.fine("HTML extraction skipped due to robots meta-tag for: " + curi.toString());
            return true;
        }
        return false;
    }

    if ("refresh".equalsIgnoreCase(httpEquiv) && content != null) {
        final int equalsAt = content.indexOf("=");
        if (equalsAt < 0) {
            return false;
        }
        final String refreshUri = content.substring(equalsAt + 1);
        try {
            curi.createAndAddLinkRelativeToBase(refreshUri, "meta", Link.REFER_HOP);
        } catch (URIException e) {
            if (getController() != null) {
                getController().logUriError(e, curi.getUURI(), refreshUri);
            } else {
                logger.info("Failed createAndAddLinkRelativeToBase " + curi + ", " + cs + ", " + refreshUri
                        + ": " + e);
            }
        }
    }
    return false;
}

From source file:org.getobjects.appserver.core.WOMessage.java

/**
 * Appends the characters of {@code _s} from index {@code _start}
 * (inclusive) to {@code _end} (exclusive) through
 * {@code appendContentHTMLString}.
 *
 * <p>As the {@link Appendable} contract requires, a {@code null} sequence
 * is treated as if it contained the four characters {@code "null"} instead
 * of causing a {@code NullPointerException}.
 *
 * @param _s the sequence to take the subsequence from, may be {@code null}
 * @param _start index of the first character to append
 * @param _end index after the last character to append
 * @return this object
 * @throws IOException declared by the {@link Appendable} signature
 * @throws IndexOutOfBoundsException if the range is invalid for the sequence
 */
public Appendable append(final CharSequence _s, int _start, int _end) throws IOException {
    final CharSequence source = (_s != null) ? _s : "null";
    this.appendContentHTMLString(source.subSequence(_start, _end).toString());
    return this;
}

From source file:org.archive.modules.extractor.ExtractorHTML.java

/**
 * Handles the value of a generic HREF-style attribute.
 *
 * <p>A value matching {@code JAVASCRIPT} is passed to
 * {@code processScriptCode} without its first 11 characters. Any other
 * value is logged at FINEST (when enabled), added via
 * {@code addLinkFromString} with {@code Hop.NAVLINK}, and counted in
 * {@code numberOfLinksExtracted}.
 *
 * @param curi the URI whose content is being processed
 * @param value the attribute value to handle
 * @param context the context handed on to {@code addLinkFromString}
 */
protected void processLink(CrawlURI curi, final CharSequence value, CharSequence context) {
    if (TextUtils.matches(JAVASCRIPT, value)) {
        // Drop the first 11 characters (presumably the "javascript:" scheme
        // prefix; confirm against the JAVASCRIPT pattern).
        final CharSequence script = value.subSequence(11, value.length());
        processScriptCode(curi, script);
        return;
    }

    if (logger.isLoggable(Level.FINEST)) {
        logger.finest("link: " + value.toString() + " from " + curi);
    }
    addLinkFromString(curi, value, context, Hop.NAVLINK);
    numberOfLinksExtracted.incrementAndGet();
}

From source file:org.archive.modules.extractor.ExtractorHTML.java

/**
 * Process style text./*from  w w w .  j ava  2s . co  m*/
 * @param curi CrawlURI we're processing.
 * @param sequence Sequence from underlying ReplayCharSequence. This
 * is TRANSIENT data. Make a copy if you want the data to live outside
 * of this extractors' lifetime.
 * @param endOfOpenTag
 */
protected void processStyle(CrawlURI curi, CharSequence sequence, int endOfOpenTag) {
    // First, get attributes of script-open tag as per any other tag.
    processGeneralTag(curi, sequence.subSequence(0, 6), sequence.subSequence(0, endOfOpenTag));

    // then, parse for URIs
    numberOfLinksExtracted.addAndGet(
            ExtractorCSS.processStyleCode(this, curi, sequence.subSequence(endOfOpenTag, sequence.length())));
}

From source file:com.jecelyin.editor.v2.core.text.TextUtils.java

/**
 * Debugging aid that prints the spans attached to a CharSequence, one span
 * per line: the covered text, the span object's identity hash in hex, its
 * canonical class name, its start-end range and its flags.
 *
 * <p>A CharSequence that is not a {@code Spanned} is printed whole on a
 * single line, followed by {@code ": (no spans)"}.
 *
 * @param cs the text to inspect
 * @param printer receives the output lines
 * @param prefix text placed at the start of every output line
 */
public static void dumpSpans(CharSequence cs, Printer printer, String prefix) {
    if (!(cs instanceof Spanned)) {
        printer.println(prefix + cs + ": (no spans)");
        return;
    }

    final Spanned spanned = (Spanned) cs;
    for (Object span : spanned.getSpans(0, cs.length(), Object.class)) {
        final int spanStart = spanned.getSpanStart(span);
        final int spanEnd = spanned.getSpanEnd(span);
        printer.println(prefix + cs.subSequence(spanStart, spanEnd) + ": "
                + Integer.toHexString(System.identityHashCode(span)) + " " + span.getClass().getCanonicalName()
                + " (" + spanStart + "-" + spanEnd + ") fl=#" + spanned.getSpanFlags(span));
    }
}

From source file:org.archive.modules.extractor.ExtractorHTML.java

/**
 * Processes a script element: first the attributes of its open tag, then
 * the code that follows it.
 *
 * @param curi CrawlURI we're processing
 * @param sequence text of the script element, starting at the open tag
 * @param endOfOpenTag index in {@code sequence} at which the open tag ends
 * and the script code begins
 */
protected void processScript(CrawlURI curi, CharSequence sequence, int endOfOpenTag) {
    // Treat the open tag like any other tag: the first 6 characters are
    // passed as the element, everything up to endOfOpenTag as its text.
    final CharSequence element = sequence.subSequence(0, 6);
    final CharSequence openTag = sequence.subSequence(0, endOfOpenTag);
    processGeneralTag(curi, element, openTag);

    // Apply best-effort string-analysis heuristics to any code present
    // (false positives are OK).
    final CharSequence scriptCode = sequence.subSequence(endOfOpenTag, sequence.length());
    processScriptCode(curi, scriptCode);
}

From source file:edu.cornell.med.icb.goby.util.SimulateBisulfiteReads.java

/**
 * Simulates reads from {@code segmentBases} and writes one record per read
 * to {@code writer}, then flushes the writer.
 *
 * <p>For each of {@code numRepeats} iterations a start position is chosen,
 * a strand is selected according to {@code doForwardStrand} and
 * {@code doReverseStrand}, and {@code readLength} bases are taken (reverse
 * complemented for the reverse strand). Each {@code 'C'} is then either
 * recorded as methylated/mutated or, under bisulfite treatment, converted
 * to {@code 'T'}.
 *
 * <p>NOTE(review): a segment shorter than {@code readLength} makes
 * {@code subSequence} throw; confirm callers never pass one.
 *
 * @param segmentBases bases of the reference segment to draw reads from
 * @param from offset added to positions within the segment to obtain the
 * zero-based positions that are reported
 * @param writer destination of the generated records
 * @throws IOException if writing to or flushing {@code writer} fails
 */
protected void process(CharSequence segmentBases, int from, Writer writer) throws IOException {

    final int segmentLength = segmentBases.length();
    for (int repeatCount = 0; repeatCount < numRepeats; repeatCount++) {
        // The start position is drawn before the strand so that the order of
        // random draws stays the same as it has always been.
        int startReadPosition = choose(0, Math.max(0, segmentLength - 1 - readLength));
        boolean matchedReverseStrand = doReverseStrand && doForwardStrand ? random.nextBoolean()
                : doReverseStrand;
        // matchedReverseStrand can only be true when doReverseStrand is set,
        // so the only disabled case left to skip is the forward strand.
        if (!matchedReverseStrand && !doForwardStrand) {
            continue;
        }

        final CharSequence selectedReadRegion = segmentBases.subSequence(startReadPosition,
                startReadPosition + readLength);
        CharSequence readBases = matchedReverseStrand ? reverseComplement(selectedReadRegion)
                : selectedReadRegion;

        MutableString sequenceInitial = new MutableString();
        MutableString sequenceTreated = new MutableString();
        MutableString log = new MutableString();
        IntArrayList mutatedPositions = new IntArrayList();

        for (int i = 0; i < readLength; i++) {

            char base = readBases.charAt(i);
            // genomic position is zero-based
            int genomicPosition = matchedReverseStrand ? readLength - (i + 1) + from + startReadPosition
                    : i + startReadPosition + from;
            sequenceInitial.append(base);

            if (base == 'C') {

                boolean isBaseMethylated = random
                        .nextDouble() <= getMethylationRateAtPosition(matchedReverseStrand, genomicPosition);

                if (isBaseMethylated) {
                    // base is methylated, stays a C on forward or reverse strand
                    if (!bisulfiteTreatment) {
                        // without bisulfite treatment, introduce mutation C -> G
                        base = 'G';
                    }
                    // bases that are methylated are protected and stay C on the forward strand. They would also
                    // be seen as G on the opposite strand if the sequencing protocol did not respect strandness
                    log.append(bisulfiteTreatment ? "met: " : "mut: ");
                    log.append(genomicPosition + 1); // write 1-based position
                    log.append(' ');

                    log.append("read-index: ");
                    log.append(i + 1);
                    log.append(' ');
                    mutatedPositions.add(genomicPosition);

                } else {
                    // bases that are not methylated are changed to T through the bisulfite and PCR conversion steps
                    if (bisulfiteTreatment) {
                        base = 'T';
                    }
                }
            }
            sequenceTreated.append(base);
        }

        MutableString coveredPositions = new MutableString();
        MutableString qualityScores = new MutableString();
        // Every base gets the same quality character (phred score 40), so
        // look it up once per read rather than once per base.
        final char qualityChar = QualityEncoding.ILLUMINA.phredQualityScoreToAsciiEncoding((byte) 40);
        for (int i = 0; i < readLength; i++) {
            qualityScores.append(qualityChar);
        }

        // zero-based positions covered by the read:
        IntArrayList readCoveredPositions = new IntArrayList();

        for (int i = startReadPosition + from; i < startReadPosition + from + readLength; i++) {
            // positions are written 1-based
            coveredPositions.append(i + 1);
            coveredPositions.append(" ");
            readCoveredPositions.add(i);
        }

        // Sanity check: every recorded position must lie within the read.
        readCoveredPositions.retainAll(mutatedPositions);
        assert readCoveredPositions.size() == mutatedPositions
                .size() : "positions mutated or changed must be covered by read.";
        writer.write(String.format("@%d reference: %s startPosition: %d strand: %s %s %s%n%s%n+%n%s%n",
                repeatCount, refChoice, startReadPosition, matchedReverseStrand ? "-1" : "+1", log,
                coveredPositions, complement(sequenceTreated), qualityScores));
    }
    writer.flush();

}