Example usage for com.google.gson.stream JsonReader JsonReader

List of usage examples for com.google.gson.stream JsonReader JsonReader

Introduction

In this page you can find the example usage for com.google.gson.stream JsonReader JsonReader.

Prototype

public JsonReader(Reader in) 

Source Link

Document

Creates a new instance that reads a JSON-encoded stream from in .

Usage

From source file:com.ikanow.infinit.e.harvest.enrichment.custom.UnstructuredAnalysisHarvester.java

License:Open Source License

/**
 * processMeta - handle an individual field
 *//*from  w w w. jav  a  2s .c  o m*/
private void processMeta(DocumentPojo f, metaField m, String text, SourcePojo source,
        UnstructuredAnalysisConfigPojo uap) {

    boolean bAllowDuplicates = false;
    if ((null != m.flags) && m.flags.contains("U")) {
        bAllowDuplicates = true;
    }
    if ((null == m.scriptlang) || m.scriptlang.equalsIgnoreCase("regex")) {

        Pattern metaPattern = createRegex(m.script, m.flags);

        int timesToRun = 1;
        Object[] currField = null;
        if ((null != m.flags) && m.flags.contains("c")) {
            currField = f.getMetadata().get(m.fieldName);
        }
        if (null != currField) { // chained metadata
            timesToRun = currField.length;
            text = (String) currField[0];
        } //TESTED

        Matcher matcher = metaPattern.matcher(text);
        LinkedList<String> Llist = null;

        for (int ii = 0; ii < timesToRun; ++ii) {
            if (ii > 0) { // (else either just text, or in the above "chained metadata" initialization above)
                text = (String) currField[ii];
                matcher = metaPattern.matcher(text);
            } //TESTED

            StringBuffer prefix = new StringBuffer(m.fieldName).append(':');
            int nFieldNameLen = m.fieldName.length() + 1;

            try {
                while (matcher.find()) {
                    if (null == Llist) {
                        Llist = new LinkedList<String>();
                    }
                    if (null == m.groupNum) {
                        m.groupNum = 0;
                    }
                    String toAdd = matcher.group(m.groupNum);
                    if (null != m.replace) {
                        toAdd = metaPattern.matcher(toAdd).replaceFirst(m.replace);
                    }
                    if ((null != m.flags) && m.flags.contains("H")) {
                        toAdd = StringEscapeUtils.unescapeHtml(toAdd);
                    }
                    prefix.setLength(nFieldNameLen);
                    prefix.append(toAdd);
                    String dupCheck = prefix.toString();

                    if (!regexDuplicates.contains(dupCheck)) {
                        Llist.add(toAdd);
                        if (!bAllowDuplicates) {
                            regexDuplicates.add(dupCheck);
                        }
                    }
                }
            } catch (Exception e) {
                this._context.getHarvestStatus().logMessage("processMeta1: " + e.getMessage(), true);
            }
        } //(end metadata chaining handling)
        if (null != Llist) {
            if (null != currField) { // (overwrite)
                f.getMetadata().put(m.fieldName, Llist.toArray());
            } else {
                f.addToMetadata(m.fieldName, Llist.toArray());
            }
        } //TESTED
    } else if (m.scriptlang.equalsIgnoreCase("javascript")) {
        if (null == f.getMetadata()) {
            f.setMetadata(new LinkedHashMap<String, Object[]>());
        }
        //set the script engine up if necessary
        if ((null != source) && (null != uap)) {
            //(these are null if called from new processing pipeline vs legacy code)
            intializeScriptEngine(source, uap);
        }

        try {
            //TODO (INF-2488): in new format, this should only happen in between contentMeta blocks/docs
            // (also should be able to use SAH _document object I think?)

            // Javascript: the user passes in 
            Object[] currField = f.getMetadata().get(m.fieldName);
            if ((null == m.flags) || m.flags.isEmpty()) {
                if (null == currField) {
                    engine.put("text", text);
                    engine.put("_iterator", null);
                }
                //(otherwise will just pass the current fields in there)
            } else { // flags specified
                if (m.flags.contains("t")) { // text
                    engine.put("text", text);
                }
                if (m.flags.contains("d")) { // entire document (minus ents and assocs)
                    GsonBuilder gb = new GsonBuilder();
                    Gson g = gb.create();
                    List<EntityPojo> ents = f.getEntities();
                    List<AssociationPojo> assocs = f.getAssociations();
                    try {
                        f.setEntities(null);
                        f.setAssociations(null);
                        engine.put("document", g.toJson(f));
                        securityManager.eval(engine, JavaScriptUtils.initScript);
                    } finally {
                        f.setEntities(ents);
                        f.setAssociations(assocs);
                    }
                }
                if (m.flags.contains("m")) { // metadata
                    GsonBuilder gb = new GsonBuilder();
                    Gson g = gb.create();
                    engine.put("_metadata", g.toJson(f.getMetadata()));
                    securityManager.eval(engine, JavaScriptUtils.iteratorMetaScript);
                }
            } //(end flags processing)

            if (null != currField) {
                f.getMetadata().remove(m.fieldName);

                GsonBuilder gb = new GsonBuilder();
                Gson g = gb.create();
                engine.put("_iterator", g.toJson(currField));
                securityManager.eval(engine, JavaScriptUtils.iteratorDocScript);
            }
            //TESTED (handling of flags, and replacing of existing fields, including when field is null but specified)

            Object returnVal = securityManager.eval(engine, m.script);

            if (null != returnVal) {
                if (returnVal instanceof String) { // The only easy case
                    Object[] array = new Object[1];
                    if ((null != m.flags) && m.flags.contains("H")) {
                        returnVal = StringEscapeUtils.unescapeHtml((String) returnVal);
                    }
                    array[0] = returnVal;
                    f.addToMetadata(m.fieldName, array);
                } else { // complex object or array - in either case the engine turns these into
                         // internal.NativeArray or internal.NativeObject

                    BasicDBList outList = JavaScriptUtils.parseNativeJsObject(returnVal, engine);
                    f.addToMetadata(m.fieldName, outList.toArray());
                }
            }
        } catch (ScriptException e) {

            _context.getHarvestStatus().logMessage(HarvestExceptionUtils.createExceptionMessage(e).toString(),
                    true);

            // Just do nothing and log
            // e.printStackTrace();
            //DEBUG (don't output log messages per doc)
            //logger.error(e.getMessage());
        } catch (Exception e) {

            _context.getHarvestStatus().logMessage(HarvestExceptionUtils.createExceptionMessage(e).toString(),
                    true);

            // Just do nothing and log
            // e.printStackTrace();
            //DEBUG (don't output log messages per doc)
            //logger.error(e.getMessage());
        }
    } else if (m.scriptlang.equalsIgnoreCase("xpath")) {

        String xpath = m.script;

        try {
            createHtmlCleanerIfNeeded();

            int timesToRun = 1;
            Object[] currField = null;
            if ((null != m.flags) && m.flags.contains("c")) {
                currField = f.getMetadata().get(m.fieldName);
            }
            if (null != currField) { // chained metadata
                f.getMetadata().remove(m.fieldName); // (so will add to the end)
                timesToRun = currField.length;
                text = (String) currField[0];
            } //TESTED

            for (int ii = 0; ii < timesToRun; ++ii) {
                if (ii > 0) { // (else either just text, or in the above "chained metadata" initialization above)
                    text = (String) currField[ii];
                } //TESTED

                TagNode node = cleaner.clean(new ByteArrayInputStream(text.getBytes()));

                //NewCode : Only use html cleaner for cleansing
                //use JAXP for full Xpath lib
                Document doc = new DomSerializer(new CleanerProperties()).createDOM(node);

                String extraRegex = extractRegexFromXpath(xpath);

                if (extraRegex != null)
                    xpath = xpath.replace(extraRegex, "");

                XPath xpa = XPathFactory.newInstance().newXPath();
                NodeList res = (NodeList) xpa.evaluate(xpath, doc, XPathConstants.NODESET);

                if (res.getLength() > 0) {
                    if ((null != m.flags) && (m.flags.contains("o"))) { // "o" for object
                        m.groupNum = -1; // (see bConvertToObject below)
                    }
                    StringBuffer prefix = new StringBuffer(m.fieldName).append(':');
                    int nFieldNameLen = m.fieldName.length() + 1;
                    ArrayList<Object> Llist = new ArrayList<Object>(res.getLength());
                    boolean bConvertToObject = ((m.groupNum != null) && (m.groupNum == -1));
                    boolean convertToXml = ((null != m.flags) && (m.flags.contains("x")));
                    for (int i = 0; i < res.getLength(); i++) {
                        Node info_node = res.item(i);
                        if ((null != m.flags) && (m.flags.contains("g"))) {
                            Llist.add(parseHtmlTable(info_node, m.replace));
                        } else if (bConvertToObject || convertToXml) {
                            // Try to create a JSON object out of this
                            StringWriter writer = new StringWriter();
                            try {
                                Transformer transformer = TransformerFactory.newInstance().newTransformer();
                                transformer.transform(new DOMSource(info_node), new StreamResult(writer));
                            } catch (TransformerException e1) {
                                continue;
                            }

                            if (bConvertToObject) {
                                try {
                                    JSONObject subObj = XML.toJSONObject(writer.toString());
                                    if (xpath.endsWith("*")) { // (can have any number of different names here)
                                        Llist.add(XmlToMetadataParser.convertJsonObjectToLinkedHashMap(subObj));
                                    } //TESTED
                                    else {
                                        String[] rootNames = JSONObject.getNames(subObj);
                                        if (1 == rootNames.length) {
                                            // (don't think it can't be any other number in fact)
                                            subObj = subObj.getJSONObject(rootNames[0]);
                                        }
                                        boolean bUnescapeHtml = ((null != m.flags) && m.flags.contains("H"));
                                        Llist.add(XmlToMetadataParser.convertJsonObjectToLinkedHashMap(subObj,
                                                bUnescapeHtml));
                                    } //TESTED
                                } catch (JSONException e) { // Just carry on
                                    continue;
                                }
                                //TESTED
                            } else { // leave in XML form
                                Llist.add(writer.toString().substring(38)); // +38: (step over <?xml version="1.0" encoding="UTF-8"?>)
                            } //TESTED (xpath_test.json)
                        } else { // Treat this as string, either directly or via regex
                            String info = info_node.getTextContent().trim();
                            if (extraRegex == null || extraRegex.isEmpty()) {
                                prefix.setLength(nFieldNameLen);
                                prefix.append(info);
                                String dupCheck = prefix.toString();

                                if (!regexDuplicates.contains(dupCheck)) {
                                    if ((null != m.flags) && m.flags.contains("H")) {
                                        info = StringEscapeUtils.unescapeHtml(info);
                                    }
                                    Llist.add(info);
                                    if (!bAllowDuplicates) {
                                        regexDuplicates.add(dupCheck);
                                    }
                                }
                            } else { // Apply regex to the string
                                Pattern dataRegex = createRegex(extraRegex, m.flags);
                                Matcher dataMatcher = dataRegex.matcher(info);
                                boolean result = dataMatcher.find();
                                while (result) {
                                    String toAdd;
                                    if (m.groupNum != null)
                                        toAdd = dataMatcher.group(m.groupNum);
                                    else
                                        toAdd = dataMatcher.group();
                                    prefix.setLength(nFieldNameLen);
                                    prefix.append(toAdd);
                                    String dupCheck = prefix.toString();

                                    if (!regexDuplicates.contains(dupCheck)) {
                                        if ((null != m.flags) && m.flags.contains("H")) {
                                            toAdd = StringEscapeUtils.unescapeHtml(toAdd);
                                        }
                                        Llist.add(toAdd);
                                        if (!bAllowDuplicates) {
                                            regexDuplicates.add(dupCheck);
                                        }
                                    }

                                    result = dataMatcher.find();
                                }
                            } //(regex vs no regex)
                        } //(end string vs object)
                    }
                    if (Llist.size() > 0) {
                        f.addToMetadata(m.fieldName, Llist.toArray());
                    }
                }
            } //(end loop over metadata objects if applicable)

        } catch (IOException ioe) {
            _context.getHarvestStatus().logMessage(HarvestExceptionUtils.createExceptionMessage(ioe).toString(),
                    true);

            // Just do nothing and log
            //DEBUG (don't output log messages per doc)
            //logger.error(ioe.getMessage());
        } catch (ParserConfigurationException e1) {
            _context.getHarvestStatus().logMessage(HarvestExceptionUtils.createExceptionMessage(e1).toString(),
                    true);
            // Just do nothing and log
            //DEBUG (don't output log messages per doc)
            //logger.error(e1.getMessage());
        } catch (XPathExpressionException e1) {
            _context.getHarvestStatus().logMessage("Error evaluating xpath expression: " + xpath, true);
        }
    } else if (m.scriptlang.equalsIgnoreCase("stream")) { // XML or JSON streaming interface
        // which one?
        try {
            boolean json = false;
            boolean xml = false;
            for (int i = 0; i < 128; ++i) {
                if ('<' == text.charAt(i)) {
                    xml = true;
                    break;
                }
                if ('{' == text.charAt(i) || '[' == text.charAt(i)) {
                    json = true;
                    break;
                }
                if (!Character.isSpaceChar(text.charAt(i))) {
                    break;
                }
            } //TESTED (too many spaces: meta_stream_test, test4; incorrect chars: test3, xml: test1, json: test2)

            boolean textNotObject = m.flags == null || !m.flags.contains("o");

            List<DocumentPojo> docs = new LinkedList<DocumentPojo>();
            List<String> levelOneFields = null;
            if (null != m.script) {
                levelOneFields = Arrays.asList(m.script.split("\\s*,\\s*"));
                if ((1 == levelOneFields.size()) && levelOneFields.get(0).isEmpty()) {
                    // convert [""] to null
                    levelOneFields = null;
                }
            } //TESTED (json and xml)

            if (xml) {
                XmlToMetadataParser parser = new XmlToMetadataParser(levelOneFields, null, null, null, null,
                        null, Integer.MAX_VALUE);
                XMLInputFactory factory = XMLInputFactory.newInstance();
                factory.setProperty(XMLInputFactory.IS_COALESCING, true);
                factory.setProperty(XMLInputFactory.SUPPORT_DTD, false);
                XMLStreamReader reader = null;
                try {
                    reader = factory.createXMLStreamReader(new ByteArrayInputStream(text.getBytes()));
                    docs = parser.parseDocument(reader, textNotObject);
                } finally {
                    if (null != reader)
                        reader.close();
                }
            } //TESTED (meta_stream_test, test1)
            if (json) {
                JsonReader jsonReader = null;
                try {
                    JsonToMetadataParser parser = new JsonToMetadataParser(null, levelOneFields, null, null,
                            Integer.MAX_VALUE);
                    jsonReader = new JsonReader(
                            new InputStreamReader(new ByteArrayInputStream(text.getBytes()), "UTF-8"));
                    jsonReader.setLenient(true);
                    docs = parser.parseDocument(jsonReader, textNotObject);
                } finally {
                    if (null != jsonReader)
                        jsonReader.close();
                }
            } //TESTED (meta_stream_test test2)

            if (!docs.isEmpty()) {
                ArrayList<String> Llist = null;
                ArrayList<Object> LlistObj = null;
                if (textNotObject) {
                    Llist = new ArrayList<String>(docs.size());
                } else {
                    LlistObj = new ArrayList<Object>(docs.size());
                }
                for (DocumentPojo doc : docs) {
                    if ((null != doc.getFullText()) || (null != doc.getMetadata())) {
                        if (textNotObject) {
                            Llist.add(doc.getFullText());
                        } //TESTED
                        else if (xml) {
                            LlistObj.add(doc.getMetadata());
                        } //TESTED
                        else if (json) {
                            Object o = doc.getMetadata();
                            if (null != o) {
                                o = doc.getMetadata().get("json");
                                if (o instanceof Object[]) {
                                    LlistObj.addAll(Arrays.asList((Object[]) o));
                                } else if (null != o) {
                                    LlistObj.add(o);
                                } //TESTED
                            }
                        } //TESTED
                    }
                } //TESTED
                if ((null != Llist) && !Llist.isEmpty()) {
                    f.addToMetadata(m.fieldName, Llist.toArray());
                } //TESTED
                if ((null != LlistObj) && !LlistObj.isEmpty()) {
                    f.addToMetadata(m.fieldName, LlistObj.toArray());
                } //TESTED

            } //TESTED (meta_stream_test test1,test2)
        } //(end try)
        catch (Exception e) { // various parsing errors
            _context.getHarvestStatus().logMessage(HarvestExceptionUtils.createExceptionMessage(e).toString(),
                    true);
        }
    } //TESTED (meta_stream_test)

    // (don't currently support other script types)
}

From source file:com.ikanow.infinit.e.harvest.extraction.document.file.FileHarvester.java

License:Open Source License

private void parse(InfiniteFile f, SourcePojo source) throws MalformedURLException, URISyntaxException {

    //NOTE: we only ever break out of here because of max docs in standalone mode
    // (because we don't know how to continue reading)

    DocumentPojo doc = null;/*from   www .  j a v a  2 s. com*/
    //Determine File Extension
    String fileName = f.getName().toString();

    int mid = fileName.lastIndexOf(".");
    String extension = fileName.substring(mid + 1, fileName.length());

    //Checked to save processing time
    long fileTimestamp = (f.getDate() / 1000) * 1000;
    // (ensure truncated to seconds, since some operation somewhere hear does this...)

    Date modDate = new Date(fileTimestamp);
    //XML Data gets placed into MetaData

    boolean bIsXml = false;
    boolean bIsJson = false;
    boolean bIsLineOriented = false;
    if ((null != source.getFileConfig()) && (null != source.getFileConfig().type)) {
        extension = source.getFileConfig().type;
    }
    bIsXml = extension.equalsIgnoreCase("xml");
    bIsJson = extension.equalsIgnoreCase("json");
    bIsLineOriented = extension.endsWith("sv");

    if (bIsXml || bIsJson || bIsLineOriented) {
        int debugMaxDocs = Integer.MAX_VALUE; // by default don't set this, it's only for debug mode
        if (_context.isStandalone()) { // debug mode
            debugMaxDocs = maxDocsPerCycle;
        }
        //fast check to see if the file has changed before processing (or if it never existed)
        if (needsUpdated_SourceUrl(modDate, f.getUrlString(), source)) {
            if (0 != modDate.getTime()) { // if it ==0 then sourceUrl doesn't exist at all, no need to delete
                // This file already exists - in normal/managed mode will re-create
                // In streaming mode, simple skip over
                if (_streaming) {
                    return;
                } //TESTED

                DocumentPojo docRepresentingSrcUrl = new DocumentPojo();
                docRepresentingSrcUrl.setSourceUrl(f.getUrlString());
                docRepresentingSrcUrl.setSourceKey(source.getKey());
                docRepresentingSrcUrl.setCommunityId(source.getCommunityIds().iterator().next());
                sourceUrlsGettingUpdated.add(docRepresentingSrcUrl.getSourceUrl());
                this.docsToRemove.add(docRepresentingSrcUrl);
                // (can add documents with just source URL, are treated differently in the core libraries)               
            }

            SourceFileConfigPojo fileSystem = source.getFileConfig();
            if ((null == fileSystem) && (bIsXml || bIsJson)) {
                fileSystem = new SourceFileConfigPojo();
            }
            XmlToMetadataParser xmlParser = null;
            JsonToMetadataParser jsonParser = null;
            String urlType = extension;
            if (bIsXml) {
                xmlParser = new XmlToMetadataParser(fileSystem.XmlRootLevelValues, fileSystem.XmlIgnoreValues,
                        fileSystem.XmlSourceName, fileSystem.XmlPrimaryKey, fileSystem.XmlAttributePrefix,
                        fileSystem.XmlPreserveCase, debugMaxDocs);
            } //TESTED
            else if (bIsJson) {
                jsonParser = new JsonToMetadataParser(fileSystem.XmlSourceName, fileSystem.XmlRootLevelValues,
                        fileSystem.XmlPrimaryKey, fileSystem.XmlIgnoreValues, debugMaxDocs);
            } //TESTED

            List<DocumentPojo> partials = null;
            try {
                if (bIsXml) {
                    XMLStreamReader xmlStreamReader = null;
                    XMLInputFactory factory = XMLInputFactory.newInstance();
                    factory.setProperty(XMLInputFactory.IS_COALESCING, true);
                    factory.setProperty(XMLInputFactory.SUPPORT_DTD, false);
                    try {
                        xmlStreamReader = factory.createXMLStreamReader(f.getInputStream());
                        partials = xmlParser.parseDocument(xmlStreamReader);
                        long memUsage = xmlParser.getMemUsage();
                        _memUsage += memUsage;
                        _totalMemUsage.addAndGet(memUsage);
                    } finally {
                        if (null != xmlStreamReader)
                            xmlStreamReader.close();
                    }
                } //TESTED
                else if (bIsJson) {
                    JsonReader jsonReader = null;
                    try {
                        jsonReader = new JsonReader(new InputStreamReader(f.getInputStream(), "UTF-8"));
                        jsonReader.setLenient(true);
                        partials = jsonParser.parseDocument(jsonReader);
                        long memUsage = jsonParser.getMemUsage();
                        _memUsage += memUsage;
                        _totalMemUsage.addAndGet(memUsage);
                    } finally {
                        if (null != jsonReader)
                            jsonReader.close();
                    }
                } //TESTED
                else if (bIsLineOriented) { // Just generate a document for every line

                    BufferedReader lineReader = null;
                    try {
                        lineReader = new BufferedReader(new InputStreamReader(f.getInputStream(), "UTF-8"));
                        CsvToMetadataParser lineParser = new CsvToMetadataParser(debugMaxDocs);
                        partials = lineParser.parseDocument(lineReader, source);
                        long memUsage = lineParser.getMemUsage();
                        _memUsage += memUsage;
                        _totalMemUsage.addAndGet(memUsage);
                    } finally {
                        if (null != lineReader)
                            lineReader.close();
                    }
                } //TESTED

                MessageDigest md5 = null; // (generates unique urls if the user doesn't below)
                try {
                    md5 = MessageDigest.getInstance("MD5");
                } catch (NoSuchAlgorithmException e) {
                    // Do nothing, unlikely to happen...
                }
                int nIndex = 0;
                int numPartials = partials.size();
                for (DocumentPojo doctoAdd : partials) {
                    nIndex++;
                    doctoAdd.setSource(source.getTitle());
                    doctoAdd.setSourceKey(source.getKey());
                    doctoAdd.setMediaType(source.getMediaType());
                    doctoAdd.setModified(new Date(fileTimestamp));
                    doctoAdd.setCreated(new Date());

                    if (null == doctoAdd.getUrl()) { // Can be set in the parser or here
                        doctoAdd.setHasDefaultUrl(true); // (ie cannot occur in a different src URL)

                        if (1 == numPartials) {
                            String urlString = f.getUrlString();
                            if (urlString.endsWith(urlType)) {
                                doctoAdd.setUrl(urlString);
                            } else {
                                doctoAdd.setUrl(
                                        new StringBuffer(urlString).append('.').append(urlType).toString());
                            }
                            // (we always set sourceUrl as the true url of the file, so want to differentiate the URL with
                            //  some useful information)
                        } else if (null == doctoAdd.getMetadata()) { // Line oriented case
                            doctoAdd.setUrl(new StringBuffer(f.getUrlString()).append("/").append(nIndex)
                                    .append('.').append(urlType).toString());
                        } else {
                            if (null == md5) { // Will never happen, MD5 always exists
                                doctoAdd.setUrl(new StringBuffer(f.getUrlString()).append("/")
                                        .append(doctoAdd.getMetadata().hashCode()).append('.').append(urlType)
                                        .toString());
                            } else { // This is the standard call if the XML parser has not been configured to build the URL
                                doctoAdd.setUrl(new StringBuffer(f.getUrlString()).append("/")
                                        .append(DigestUtils.md5Hex(doctoAdd.getMetadata().toString()))
                                        .append('.').append(urlType).toString());
                            }
                        } //TESTED
                    }
                    doctoAdd.setTitle(f.getName().toString());
                    doctoAdd.setPublishedDate(new Date(fileTimestamp));
                    doctoAdd.setSourceUrl(f.getUrlString());

                    // Always add to files because I'm deleting the source URL
                    files.add(doctoAdd);
                } //TESTED 

            } catch (XMLStreamException e1) {
                errors++;
                _context.getHarvestStatus()
                        .logMessage(HarvestExceptionUtils.createExceptionMessage(e1).toString(), true);
            } catch (FactoryConfigurationError e1) {
                errors++;
                _context.getHarvestStatus().logMessage(e1.getMessage(), true);

            } catch (IOException e1) {
                errors++;
                _context.getHarvestStatus()
                        .logMessage(HarvestExceptionUtils.createExceptionMessage(e1).toString(), true);
            } catch (Exception e1) {
                errors++;
                _context.getHarvestStatus()
                        .logMessage(HarvestExceptionUtils.createExceptionMessage(e1).toString(), true);
            }
        } //(end if needs updated)
    } else //Tika supports Excel,Word,Powerpoint,Visio, & Outlook Documents
    {
        // (This dedup tells me if it's an add/update vs ignore - qr.isDuplicate higher up tells me if I need to add or update)
        if (needsUpdated_Url(modDate, f.getUrlString(), source)) {

            Metadata metadata = null;
            InputStream in = null;
            try {

                doc = new DocumentPojo();

                // Create a tika object (first time only)
                if (null == _tika) {
                    this.initializeTika(_context, source);
                }

                // BUGGERY
                // NEED TO LIKELY SET LIMIT TO BE 30MB or 50MB and BYPASS ANYTHING OVER THAT BELOW IS THE CODE TO DO THAT
                // tika.setMaxStringLength(30*1024*1024);
                // Disable the string length limit
                _tika.setMaxStringLength(-1);
                //input = new FileInputStream(new File(resourceLocation));
                // Create a metadata object to contain the metadata

                metadata = new Metadata();
                // Parse the file and get the text of the file
                doc.setSource(source.getTitle());
                doc.setSourceKey(source.getKey());
                doc.setMediaType(source.getMediaType());
                String fullText = "";

                in = f.getInputStream();
                try {
                    if (null == _tikaOutputFormat) { // text only
                        fullText = _tika.parseToString(in, metadata);
                    } //TESTED
                    else { // XML/HMTL
                        _tika.getParser().parse(in, _tikaOutputFormat, metadata, _tikaOutputParseContext);
                        fullText = _tikaXmlFormatWriter.toString();
                        _tikaXmlFormatWriter.getBuffer().setLength(0);
                    } //TESTED
                } finally {
                    if (null != in)
                        in.close();
                }
                int descCap = 500;
                doc.setFullText(fullText);
                if (descCap > fullText.length()) {
                    descCap = fullText.length();
                }
                doc.setDescription(fullText.substring(0, descCap));
                doc.setModified(new Date(fileTimestamp));
                doc.setCreated(new Date());
                doc.setUrl(f.getUrlString());
                doc.setTitle(f.getName().toString());
                doc.setPublishedDate(new Date(fileTimestamp));

                long memUsage = (250L * (doc.getFullText().length() + doc.getDescription().length())) / 100L; // 25% overhead, 2x for string->byte
                _memUsage += memUsage;
                _totalMemUsage.addAndGet(memUsage);

                // If the metadata contains a more plausible date then use that
                try {
                    String title = metadata.get(Metadata.TITLE);
                    if (null != title) {
                        doc.setTitle(title);
                    }
                } catch (Exception e) { // Fine just carry on                  
                }
                try {
                    Date date = metadata.getDate(Metadata.CREATION_DATE); // MS Word
                    if (null != date) {
                        doc.setPublishedDate(date);
                    } else {
                        date = metadata.getDate(Metadata.DATE); // Dublin
                        if (null != date) {
                            doc.setPublishedDate(date);
                        } else {
                            date = metadata.getDate(Metadata.ORIGINAL_DATE);
                            if (null != date) {
                                doc.setPublishedDate(date);
                            }
                        }
                    }
                } catch (Exception e) { // Fine just carry on                  
                }
                //TESTED

                // If the metadata contains a geotag then apply that:
                try {
                    String lat = metadata.get(Metadata.LATITUDE);
                    String lon = metadata.get(Metadata.LONGITUDE);
                    if ((null != lat) && (null != lon)) {
                        GeoPojo gt = new GeoPojo();
                        gt.lat = Double.parseDouble(lat);
                        gt.lon = Double.parseDouble(lon);
                        doc.setDocGeo(gt);
                    }
                } catch (Exception e) { // Fine just carry on                  
                }

                // Save the entire metadata:
                doc.addToMetadata("_FILE_METADATA_", metadata);

                for (ObjectId communityId : source.getCommunityIds()) {
                    doc.setCommunityId(communityId);
                }
                files.add(doc);

                // Close the input stream
                in.close();
                in = null;

                //TESTED

            } catch (SmbException e) {
                errors++;
                _context.getHarvestStatus()
                        .logMessage(HarvestExceptionUtils.createExceptionMessage(e).toString(), true);
            } catch (MalformedURLException e) {
                errors++;
                _context.getHarvestStatus()
                        .logMessage(HarvestExceptionUtils.createExceptionMessage(e).toString(), true);
            } catch (UnknownHostException e) {
                errors++;
                _context.getHarvestStatus()
                        .logMessage(HarvestExceptionUtils.createExceptionMessage(e).toString(), true);
            } catch (IOException e) {
                errors++;
                _context.getHarvestStatus().logMessage(e.getMessage(), true);
            } catch (TikaException e) {
                errors++;
                _context.getHarvestStatus().logMessage(e.getMessage(), true);
            } catch (Exception e) {
                errors++;
                _context.getHarvestStatus()
                        .logMessage(HarvestExceptionUtils.createExceptionMessage(e).toString(), true);
            } finally { // Close the input stream if an error occurs
                if (null != in) {
                    try {
                        in.close();
                    } catch (IOException e) {
                        // All good, do nothing
                    }
                }
            } // end exception handling
        } // end dedup check
    } // end XML vs "office" app

    //DEBUG
    //System.out.println("FILE=" + files.size() + " / MEM=" + _memUsage + " VS " + Runtime.getRuntime().totalMemory());
}

From source file:com.imaginea.betterdocs.BetterDocsAction.java

License:Apache License

private ArrayList<Integer> getLineNumbers(Collection<String> imports, String tokens) {
    ArrayList<Integer> lineNumbers = new ArrayList<Integer>();
    JsonReader reader = new JsonReader(new StringReader(tokens));
    reader.setLenient(true);// w w  w . ja va2 s  .  c  om
    JsonArray tokensArray = new JsonParser().parse(reader).getAsJsonArray();

    for (JsonElement token : tokensArray) {
        JsonObject jObject = token.getAsJsonObject();
        String importName = jObject.getAsJsonPrimitive(IMPORT_NAME).getAsString();
        if (imports.contains(importName)) {
            JsonArray lineNumbersArray = jObject.getAsJsonArray(LINE_NUMBERS);
            for (JsonElement lineNumber : lineNumbersArray) {
                lineNumbers.add(lineNumber.getAsInt());
            }
        }
    }
    return lineNumbers;
}

From source file:com.imaginea.betterdocs.BetterDocsAction.java

License:Apache License

private String getContentsForFile(String file) {
    String esFileQueryJson = getJsonForFileContent(file);
    String esFileResultJson = getESResultJson(esFileQueryJson, esURL + SOURCEFILE_SEARCH);

    JsonReader reader = new JsonReader(new StringReader(esFileResultJson));
    reader.setLenient(true);/* www . jav a2s.  c o  m*/
    JsonElement jsonElement = new JsonParser().parse(reader);
    JsonObject jsonObject = jsonElement.getAsJsonObject();
    JsonObject hitsObject = jsonObject.getAsJsonObject(HITS);
    JsonArray hitsArray = hitsObject.getAsJsonArray(HITS);
    JsonObject hitObject = hitsArray.get(0).getAsJsonObject();
    JsonObject sourceObject = hitObject.getAsJsonObject(SOURCE);
    //Replacing \r as it's treated as bad end of line character
    String fileContent = sourceObject.getAsJsonPrimitive(FILE_CONTENT).getAsString().replaceAll("\r", "");
    return fileContent;
}

From source file:com.imaginea.betterdocs.BetterDocsAction.java

License:Apache License

private static Map<String, String> getFileTokens(String esResultJson) {
    Map<String, String> fileTokenMap = new HashMap<String, String>();
    JsonReader reader = new JsonReader(new StringReader(esResultJson));
    reader.setLenient(true);/*www  . ja  v a2 s  .c  o m*/
    JsonElement jsonElement = new JsonParser().parse(reader);
    JsonObject jsonObject = jsonElement.getAsJsonObject();
    JsonObject hitsObject = jsonObject.getAsJsonObject(HITS);
    JsonArray hitsArray = hitsObject.getAsJsonArray(HITS);

    for (JsonElement hits : hitsArray) {
        JsonObject hitObject = hits.getAsJsonObject();
        JsonObject sourceObject = hitObject.getAsJsonObject(SOURCE);
        String fileName = sourceObject.getAsJsonPrimitive(FILE).getAsString();
        String tokens = sourceObject.get(TOKENS).toString();
        fileTokenMap.put(fileName, tokens);
    }
    return fileTokenMap;
}

From source file:com.imaginea.kodebeagle.base.util.ESUtils.java

License:Apache License

protected final JsonObject getJsonElements(final String esResultJson) {
    JsonReader reader = new JsonReader(new StringReader(esResultJson));
    reader.setLenient(true);// www  .j  a  v a  2  s . c  om
    JsonElement jsonElement = new JsonParser().parse(reader);
    if (jsonElement.isJsonObject()) {
        return jsonElement.getAsJsonObject().getAsJsonObject(HITS);
    } else {
        return null;
    }
}

From source file:com.imaginea.kodebeagle.base.util.JSONUtils.java

License:Apache License

public final List<Integer> getLineNumbers(final Collection<String> imports, final String tokens) {
    List<Integer> lineNumbers = new ArrayList<Integer>();
    JsonReader reader = new JsonReader(new StringReader(tokens));
    reader.setLenient(true);/*from ww w . j  av a2  s.co  m*/
    JsonArray tokensArray = new JsonParser().parse(reader).getAsJsonArray();
    for (JsonElement token : tokensArray) {
        JsonObject jObject = token.getAsJsonObject();
        String importName = jObject.getAsJsonPrimitive(IMPORT_EXACT_NAME).getAsString();
        if (imports.contains(importName)) {
            JsonArray lineNumbersArray = jObject.getAsJsonArray(LINE_NUMBERS);
            for (JsonElement lineNumberInfo : lineNumbersArray) {
                lineNumbers.add(lineNumberInfo.getAsJsonObject().getAsJsonPrimitive(LINE_NUMBER).getAsInt());
            }
        }
    }
    return lineNumbers;
}

From source file:com.imaginea.kodebeagle.base.util.SearchUtils.java

License:Apache License

protected final JsonObject getJsonElements(final String esResultJson) {
    JsonReader reader = new JsonReader(new StringReader(esResultJson));
    reader.setLenient(true);/* w w w.j a  v  a  2s . co m*/
    JsonElement jsonElement = new JsonParser().parse(reader);
    if (jsonElement.isJsonObject()) {
        return jsonElement.getAsJsonObject();
    } else {
        return null;
    }
}

From source file:com.imaginea.kodebeagle.base.util.SearchUtils.java

License:Apache License

public final List<Integer> getLineNumbers(final Collection<String> imports, final String tokens) {

    List<Integer> lineNumbers = new ArrayList<Integer>();
    JsonReader reader = new JsonReader(new StringReader(tokens));
    reader.setLenient(true);/*from   w  w  w  . java2  s .c  om*/
    JsonArray tokensArray = new JsonParser().parse(reader).getAsJsonArray();
    for (JsonElement token : tokensArray) {
        JsonObject jObject = token.getAsJsonObject();
        String typeName = jObject.getAsJsonPrimitive(TYPE_EXACT_NAME).getAsString();
        if (imports.contains(typeName)) {

            JsonArray propsArray = jObject.getAsJsonArray(PROPS);
            for (JsonElement propsInfo : propsArray) {

                JsonArray lineNumbersArray = propsInfo.getAsJsonObject().getAsJsonArray(LINES);
                for (JsonElement lineNumber : lineNumbersArray) {

                    lineNumbers.add(lineNumber.getAsJsonArray().get(0).getAsInt());

                }

            }
        }
    }
    return lineNumbers;
}

From source file:com.imminentmeals.android.core._Parcer.java

License:Apache License

private T decode(String json) throws IOException {
    JsonReader reader = new JsonReader(new StringReader(json));

    //noinspection TryFinallyCanBeTryWithResources
    try {/* ww w .java 2 s.  com*/
        reader.beginObject();

        final Class<?> type = Class.forName(reader.nextName());
        return _gson.fromJson(reader, type);
    } catch (ClassNotFoundException e) {
        throw new RuntimeException(e);
    } finally {
        reader.close();
    }
}