{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,1]],"date-time":"2026-04-01T16:01:16Z","timestamp":1775059276834,"version":"3.50.1"},"reference-count":10,"publisher":"IEEE","content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2015,8]]},"DOI":"10.1109\/iri.2015.43","type":"proceedings-article","created":{"date-parts":[[2015,10,26]],"date-time":"2015-10-26T22:07:29Z","timestamp":1445897249000},"page":"215-218","source":"Crossref","is-referenced-by-count":2,"title":["Extending Spark Analytics through Tika-Based Information Extraction and Retrieval"],"prefix":"10.1109","author":[{"given":"Rishi","family":"Verma","sequence":"first","affiliation":[]},{"given":"Chris","family":"Mattmann","sequence":"additional","affiliation":[]}],"member":"263","reference":[{"key":"ref4","article-title":"Spark: cluster computing with working sets","author":"zaharia","year":"2010","journal-title":"2nd USENIX Conf on Hot Topics in Cloud Computing (HotCloud'10)"},{"key":"ref3","article-title":"Apache Hadoop","year":"2014","journal-title":"Welcome to Apache Hadoop!"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.1109\/38.56302"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.1145\/1465482.1465560"},{"key":"ref5","article-title":"Resilient distributed datasets: a fault-tolerant abstraction for in-memory cluster computing","author":"zaharia","year":"2012","journal-title":"Proceedings of the 9th USENIX Conference on Networked Systems Design and Implementation (NSDI'12)"},{"key":"ref8","article-title":"Apache PDFBox - A Java PDF Library","year":"2015","journal-title":"Apache PDFBox Apache Software Foundation Web"},{"key":"ref7","article-title":"[SPARK-2759][CORE] Generic Binary File Support in Spark by Kmader. Pull Request #1658. Apache\/spark","author":"mader","year":"2014","journal-title":"Apache Spark Git GitHub Inc"},{"key":"ref2","article-title":"Tika in Action","author":"mattmann","year":"2012"},{"key":"ref9","article-title":"HDF: The Hierarchical Data Format","author":"fortner","year":"1998","journal-title":"Dr Dobb's Journal Software Tools for the Professional Programmer"},{"key":"ref1","article-title":"Apache Spark - Lightning-Fast Cluster Computing","year":"0","journal-title":"Apache spark - lightning-fast cluster computing"}],"event":{"name":"2015 IEEE International Conference on Information Reuse and Integration (IRI)","location":"San Francisco, CA, USA","start":{"date-parts":[[2015,8,13]]},"end":{"date-parts":[[2015,8,15]]}},"container-title":["2015 IEEE International Conference on Information Reuse and Integration"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx7\/7299513\/7300933\/07300979.pdf?arnumber=7300979","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2020,10,14]],"date-time":"2020-10-14T16:03:54Z","timestamp":1602691434000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/7300979"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2015,8]]},"references-count":10,"URL":"https:\/\/doi.org\/10.1109\/iri.2015.43","relation":{},"subject":[],"published":{"date-parts":[[2015,8]]}}}