{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,10]],"date-time":"2026-04-10T10:05:38Z","timestamp":1775815538673,"version":"3.50.1"},"reference-count":66,"publisher":"Elsevier BV","issue":"6","license":[{"start":{"date-parts":[[2007,9,1]],"date-time":"2007-09-01T00:00:00Z","timestamp":1188604800000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.elsevier.com\/tdm\/userlicense\/1.0\/"},{"start":{"date-parts":[[2007,9,1]],"date-time":"2007-09-01T00:00:00Z","timestamp":1188604800000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.elsevier.com\/legal\/tdmrep-license"}],"content-domain":{"domain":["elsevier.com","sciencedirect.com"],"crossmark-restriction":true},"short-container-title":["Information Systems"],"published-print":{"date-parts":[[2007,9]]},"DOI":"10.1016\/j.is.2006.09.004","type":"journal-article","created":{"date-parts":[[2006,11,8]],"date-time":"2006-11-08T07:30:00Z","timestamp":1162971000000},"page":"886-908","update-policy":"https:\/\/doi.org\/10.1016\/elsevier_cm_policy","source":"Crossref","is-referenced-by-count":59,"title":["Combining text and link analysis for focused crawling\u2014An application for vertical search engines"],"prefix":"10.1016","volume":"32","author":[{"given":"G.","family":"Almpanidis","sequence":"first","affiliation":[]},{"given":"C.","family":"Kotropoulos","sequence":"additional","affiliation":[]},{"given":"I.","family":"Pitas","sequence":"additional","affiliation":[]}],"member":"78","reference":[{"key":"10.1016\/j.is.2006.09.004_bib2","unstructured":"D. Sullivan, The vortals are coming! The vortals are coming! From The Search Engine Report. Online at \u3008http:\/\/searchenginewatch.com\/sereport\/00\/04-vortals.html\u3009."},{"key":"10.1016\/j.is.2006.09.004_bib3","unstructured":"D. Sullivan, Now, it's the \u201cvectories\u201d that are coming!. From The Search Engine Report. Online at \u3008http:\/\/searchenginewatch.com\/sereport\/00\/08-vectories.html\u3009."},{"key":"10.1016\/j.is.2006.09.004_bib4","unstructured":"R. Steele, Techniques for specialized search engines, in: Proceedings of the Internet Computing \u201901, Las Vegas, June 25\u201328, 2001."},{"key":"10.1016\/j.is.2006.09.004_bib5","unstructured":"K. Rijsbergen, Information Retrieval. Online \u3008http:\/\/www.dcs.gla.ac.uk\/Keith\/Preface.html\u3009."},{"key":"10.1016\/j.is.2006.09.004_bib6","doi-asserted-by":"crossref","unstructured":"B. Davison, Topical locality in the Web, in: Proceedings of the 23rd ACM International Conference on Research and Development in Information Retrieval (SIGIR 2000), Athens, Greece, 2000, pp. 272\u2013279.","DOI":"10.1145\/345508.345597"},{"issue":"11","key":"10.1016\/j.is.2006.09.004_bib7","doi-asserted-by":"crossref","first-page":"613","DOI":"10.1145\/361219.361220","article-title":"A vector space model for automatic indexing","volume":"18","author":"Salton","year":"1975","journal-title":"Commun. ACM"},{"issue":"1","key":"10.1016\/j.is.2006.09.004_bib8","doi-asserted-by":"crossref","first-page":"30","DOI":"10.1108\/eb026672","article-title":"Search term relevance weighting given little relevance information","volume":"35","author":"Jones","year":"1979","journal-title":"J. Doc."},{"key":"10.1016\/j.is.2006.09.004_bib9","doi-asserted-by":"crossref","first-page":"391","DOI":"10.1002\/(SICI)1097-4571(199009)41:6<391::AID-ASI1>3.0.CO;2-9","article-title":"Indexing by latent semantic analysis","volume":"41","author":"Deerwester","year":"1990","journal-title":"J. Am. Soc. Inform. Sci."},{"key":"10.1016\/j.is.2006.09.004_bib10","doi-asserted-by":"crossref","unstructured":"P. Foltz, Improving human-proceedings interaction: indexing the CHI index, in: Proceedings of the Conference on Human Factors in Computing Systems, Denver, 1995, pp. 101\u2013102.","DOI":"10.1145\/223355.223450"},{"issue":"4","key":"10.1016\/j.is.2006.09.004_bib11","doi-asserted-by":"crossref","first-page":"573","DOI":"10.1137\/1037127","article-title":"Using linear algebra for intelligent information retrieval","volume":"37","author":"Berry","year":"1995","journal-title":"SIAM Rev."},{"key":"10.1016\/j.is.2006.09.004_bib12","series-title":"Understanding Search Engines: Mathematical Modeling and Text Retrieval","author":"Berry","year":"1999"},{"key":"10.1016\/j.is.2006.09.004_bib13","unstructured":"G. O\u2019Brien, Information management tools for updating an SVD-encoded indexing scheme, Master's thesis, University of Tennessee, Knoxville, TN, 1994."},{"key":"10.1016\/j.is.2006.09.004_bib14","series-title":"Social Network Analysis","author":"Wasserman","year":"1994"},{"key":"10.1016\/j.is.2006.09.004_bib15","doi-asserted-by":"crossref","first-page":"10","DOI":"10.1002\/asi.5090140103","article-title":"Bibliographic coupling between scientific papers","volume":"14","author":"Kessler","year":"1963","journal-title":"Am. Doc."},{"key":"10.1016\/j.is.2006.09.004_bib16","doi-asserted-by":"crossref","first-page":"265","DOI":"10.1002\/asi.4630240406","article-title":"Co-citation in scientific literature: a new measure of the relationship between two documents","volume":"24","author":"Small","year":"1973","journal-title":"J. Am. Soc. Inform. Sci."},{"issue":"5","key":"10.1016\/j.is.2006.09.004_bib17","doi-asserted-by":"crossref","first-page":"445","DOI":"10.1016\/0306-4573(86)90091-9","volume":"22","year":"1986","journal-title":"Inform. Process. Manage."},{"key":"10.1016\/j.is.2006.09.004_bib18","doi-asserted-by":"crossref","unstructured":"A. Ng, A. Zheng, M. Jordan, Stable algorithms for link analysis, in: Proceedings of the ACM International Conference on Research and Development in Information Retrieval (SIGIR 2001), 2001, pp. 258\u2013266.","DOI":"10.1145\/383952.384003"},{"key":"10.1016\/j.is.2006.09.004_bib19","series-title":"Proceedings of the 10th Text Rerieval Conference (TREC-10)","article-title":"Combining text- and link-based methods for Web IR","author":"Yang","year":"2002"},{"key":"10.1016\/j.is.2006.09.004_bib20","doi-asserted-by":"crossref","unstructured":"S. Haas, E. Grams, Page and link classifications: connecting diverse resources, in: Proceedings of the Third ACM Conference, Digital libraries (DL 1998), Pittsburgh, 1998, pp. 99\u2013107.","DOI":"10.1145\/276675.276686"},{"key":"10.1016\/j.is.2006.09.004_bib21","unstructured":"CLEVER Project. IBM Almaden Research Center, Online at \u3008http:\/\/www.almaden.ibm.com\/projects\/clever.shtml\u3009."},{"issue":"1\u20137","key":"10.1016\/j.is.2006.09.004_bib22","first-page":"107","article-title":"The anatomy of a large-scale hypertextual web search engine","volume":"30","author":"Brin","year":"1998","journal-title":"WWW7\/Computer Networks"},{"key":"10.1016\/j.is.2006.09.004_bib23","unstructured":"J. Kleinberg, Authoritative sources in a hyperlinked environment, in: Proceedings of the Ninth Annual ACM-SIAM Symposium, Discrete Algorithms, January 1998, pp. 668\u2013677."},{"issue":"3","key":"10.1016\/j.is.2006.09.004_bib24","first-page":"3","article-title":"Link analysis in web information retrieval","volume":"23","author":"Henzinger","year":"2000","journal-title":"IEEE Data Eng. Bull."},{"key":"10.1016\/j.is.2006.09.004_bib25","doi-asserted-by":"crossref","unstructured":"K. Bharat, M. Henzinger, Improved algorithms for topic distillation in hyperlinked environments, in: Proceedings of the ACM International Conference on Research and Development Information Retrieval (SIGIR 1998), Melbourne (Australia), August 1998, pp. 104\u2013111.","DOI":"10.1145\/290941.290972"},{"key":"10.1016\/j.is.2006.09.004_bib26","first-page":"430","article-title":"The missing link\u2014a probabilistic model of document content and hypertext connectivity","volume":"vol. 13","author":"Cohn","year":"2001"},{"key":"10.1016\/j.is.2006.09.004_bib27","unstructured":"D. Cohn, H. Chang, Learning to probabilistically identify authoritative documents. In: Proceedings of the 17th International Conference on Machine Learning, Stanford University, 2000, pp. 167\u2013174."},{"key":"10.1016\/j.is.2006.09.004_bib28","series-title":"Modern Information Retrieval","author":"Baeza-Yates","year":"1999"},{"key":"10.1016\/j.is.2006.09.004_bib29","doi-asserted-by":"crossref","unstructured":"A. Gulli, A. Signorini, The indexable Web is more than 11.5 billion pages, in: Proceedings of the 14th International Conference on WWW (WWW05), 2005, pp. 902\u2013903.","DOI":"10.1145\/1062745.1062789"},{"key":"10.1016\/j.is.2006.09.004_bib30","unstructured":"J. Hynek, K. Jezek, Document classification using itemsets, in: Proceedings of the 34th International Conference on MOSIS 2000, pp. 97\u2013102."},{"key":"10.1016\/j.is.2006.09.004_bib31","unstructured":"D. Zeinalipou-Yatzi, M. Dikaiakos, High-performance crawling and filtering in Java, in: Proceedings of the Eighth Panhellenic Conference on Informatics, vol. 2, November 2001, pp. 377\u2013386."},{"key":"10.1016\/j.is.2006.09.004_bib33","doi-asserted-by":"crossref","unstructured":"M. Najork, J. Wiener, Breadth-first search crawling yields high-quality pages, in: Proceedings of the 10th International Conference on World Wide Web (WWW01), 2001, pp. 114\u2013118.","DOI":"10.1145\/371920.371965"},{"key":"10.1016\/j.is.2006.09.004_bib34","doi-asserted-by":"crossref","first-page":"1623","DOI":"10.1016\/S1389-1286(99)00052-3","article-title":"Focused crawling: a new approach to topic-specific Web resource discovery","volume":"31","author":"Chakrabarti","year":"1999","journal-title":"Comput. Networks"},{"issue":"5","key":"10.1016\/j.is.2006.09.004_bib35","doi-asserted-by":"crossref","first-page":"55","DOI":"10.1109\/MC.2003.1198237","article-title":"Comparison of three vertical search spiders","volume":"36","author":"Chau","year":"2003","journal-title":"IEEE Comput. Mag."},{"issue":"1","key":"10.1016\/j.is.2006.09.004_bib36","doi-asserted-by":"crossref","first-page":"2","DOI":"10.1145\/383034.383035","article-title":"Searching the web","author":"Arasu","year":"2001","journal-title":"ACM Trans. Internet Technol."},{"key":"10.1016\/j.is.2006.09.004_bib37","doi-asserted-by":"crossref","unstructured":"J. Cho, H.G. Molina, L. Page, Efficient crawling through URL ordering. In: Proceedings of the Seventh International Conference on World Wide Web (WWW98), Brisbane, Australia, 1998, pp. 161\u2013172.","DOI":"10.1016\/S0169-7552(98)00108-1"},{"key":"10.1016\/j.is.2006.09.004_bib38","unstructured":"P. Srinivasan, G. Pant, F. Menczer, Target seeking crawlers and their topical performance. In: Proceedings of the ACM International Conference on Research and Development in Information Retrieval (SIGIR 2002), August 2002."},{"key":"10.1016\/j.is.2006.09.004_bib39","doi-asserted-by":"crossref","unstructured":"T. Haveliwala, Topic-sensitive PageRank, in: Proceedings of the 11th International Conference on World Wide Web (WWW02), Honolulu, Hawaii, May 2002, pp. 517\u2013526.","DOI":"10.1145\/511446.511513"},{"key":"10.1016\/j.is.2006.09.004_bib40","doi-asserted-by":"crossref","unstructured":"R. Baeza-Yates, C. Castillo, M. Marin, A. Rodriguez, Crawling a country: better strategies than breadth-first for web page ordering, in: Proceedings of the International Conference on World Wide Web (WWW05), Chiba, Japan, 2005, pp. 864\u2013872.","DOI":"10.1145\/1062745.1062768"},{"key":"10.1016\/j.is.2006.09.004_bib41","doi-asserted-by":"crossref","unstructured":"P. Boldi, M. Santini, S. Vigna, Do your worst to make the best: paradoxical effects in PageRank incremental computations, in: Proceedings of the Algorithms and Models for the Web-Graph: Third International Workshop (WAW 2004), Rome, Italy, October 2004, pp. 168\u2013180.","DOI":"10.1007\/978-3-540-30216-2_14"},{"key":"10.1016\/j.is.2006.09.004_bib42","unstructured":"M. Diligenti, F. Coetzee, S. Lawrence, C.L. Giles, M. Gori, Focused crawling using context graphs, in: Proceedings of the 26th International Conference on Very Large Databases (VLDB 2000), Cairo, Egypt, 2000, pp. 527\u2013534."},{"key":"10.1016\/j.is.2006.09.004_bib43","unstructured":"J. Rennie, A. McCallum, Using reinforcement learning to spider the web efficiently, in: Proceedings of the 16th International Conference on Machine Learning (ICML99), 1999, pp. 335\u2013343."},{"key":"10.1016\/j.is.2006.09.004_bib44","doi-asserted-by":"crossref","unstructured":"S. Chakrabarti, Integrating the document object model with hyperlinks for enhanced topic distillation and information extraction, in: Proceedings of the 10th International Conference on World Wide Web (WWW10), Hong Kong, 2001, pp. 211\u2013220.","DOI":"10.1145\/371920.372054"},{"key":"10.1016\/j.is.2006.09.004_bib45","doi-asserted-by":"crossref","unstructured":"C. Aggarwal, F. Al-Garawi, P. Yu, Intelligent crawling on the world wide web with arbitrary predicates, in: Proceedings of the 10th International World Wide Web Conference (WWW10), Hong Kong, 2001, pp. 96\u2013105.","DOI":"10.1145\/371920.371955"},{"key":"10.1016\/j.is.2006.09.004_bib46","unstructured":"S. Sizov, M. Theobald, S. Siersdorfer, G. Weikum, J. Graupmann, M. Biwer, P. Zimmer, The BINGO! system for information portal generation and expert web search, in: Proceedings of the First Conference on Innovative Data Systems Research (CIDR), 2003."},{"issue":"8","key":"10.1016\/j.is.2006.09.004_bib47","first-page":"711","article-title":"A scalable fully distributed web crawler","volume":"34","author":"Boldi","year":"2004","journal-title":"Software: Pract. Experience"},{"issue":"6","key":"10.1016\/j.is.2006.09.004_bib48","first-page":"585","article-title":"THESUS: effective thematic selection and organization of web document collections based on link semantics","volume":"16","author":"Varlamis","year":"2004","journal-title":"IEEE Trans. Knowledge Data Eng."},{"key":"10.1016\/j.is.2006.09.004_bib49","doi-asserted-by":"crossref","unstructured":"D. Bergmark, C. Lagoze, A. Sbityakov, Focused crawls, tunneling, and digital libraries, in: Proceedings of the Sixth European Conference on Research and Advanced Technology for Digital Libraries, 2002, pp. 91\u2013106.","DOI":"10.1007\/3-540-45747-X_7"},{"key":"10.1016\/j.is.2006.09.004_bib50","doi-asserted-by":"crossref","unstructured":"F. Menczer, G. Pant, M. Ruiz, P. Srinivasan, Evaluating topic-driven web crawlers, in: Proceedings of the ACM International Conference on Research and Development in Information Retrieval (SIGIR 2001), New Orleans, 2001, pp. 241\u2013249.","DOI":"10.1145\/383952.383995"},{"key":"10.1016\/j.is.2006.09.004_bib51","unstructured":"F. Menczer, ARACHNID: adaptive retrieval agents choosing heuristic neighborhoods for information discovery, in: Proceedings of the 14th International Conference on Machine Learning, 1997, pp. 227\u2013235."},{"issue":"2","key":"10.1016\/j.is.2006.09.004_bib53","doi-asserted-by":"crossref","first-page":"131","DOI":"10.1177\/0165551506062326","article-title":"The freshness of Web search engines\u2019 databases","volume":"32","author":"Lewandowski","year":"2006","journal-title":"Inform. Sci."},{"key":"10.1016\/j.is.2006.09.004_bib54","doi-asserted-by":"crossref","unstructured":"J. Leskovec, J. Kleinberg, C. Faloutsos, Graphs over time: densification laws, shrinking diameters and possible explanations, in: Proceedings of the 11th ACM SIGKDD International Conference on Knowledge Discovery in Data Mining (KDD05), 2005, pp. 177\u2013187.","DOI":"10.1145\/1081870.1081893"},{"key":"10.1016\/j.is.2006.09.004_bib55","series-title":"Mining the Web: Discovering Knowledge from Hypertext Data","author":"Chakrabarti","year":"2002"},{"key":"10.1016\/j.is.2006.09.004_bib56","doi-asserted-by":"crossref","first-page":"317","DOI":"10.1016\/S0169-7552(98)00038-5","article-title":"The shark-search algorithm. An application: tailored Web site mapping","volume":"30","author":"Hersovici","year":"1998","journal-title":"Comput. Networks ISDN Syst."},{"key":"10.1016\/j.is.2006.09.004_bib57","unstructured":"S. Zelikovitz, H. Hirsh, Improving text classification with LSI using background knowledge, Workshop Notes Text Learning: Beyond Supervision (IJCAI01)."},{"issue":"27","key":"10.1016\/j.is.2006.09.004_bib58","doi-asserted-by":"crossref","first-page":"183","DOI":"10.1016\/0169-7552(94)90132-5","article-title":"Information retrieval in the world-wide web: making client-based searching feasible","author":"De Bra","year":"1994","journal-title":"J. Comput. Networks ISDN Syst."},{"issue":"3","key":"10.1016\/j.is.2006.09.004_bib60","doi-asserted-by":"crossref","first-page":"130","DOI":"10.1108\/eb046814","article-title":"An algorithm for suffix stripping","volume":"14","author":"Porter","year":"1980","journal-title":"Program"},{"key":"10.1016\/j.is.2006.09.004_bib61","unstructured":"G. Pant, P. Srinivasan, F. Menczer, Exploration versus exploitation in topic driven crawlers, in: Proceedings of the Second International Workshop Web Dynamics, Honolulu, May, 2002."},{"key":"10.1016\/j.is.2006.09.004_bib62","doi-asserted-by":"crossref","unstructured":"S. Robertson, S. Walker, M. Hancock-Beaulieu, A. Gull, M. Lau, Okapi at TREC, in: Proceedings of the First Text REtrieval Conference (TREC-1), Gaitherburg, Maryland, 1992, pp. 21\u201330.","DOI":"10.6028\/NIST.SP.500-215.city"},{"key":"10.1016\/j.is.2006.09.004_bib63","doi-asserted-by":"crossref","unstructured":"C. Tang, S. Dwarkadas, Z. Xu, On scaling latent semantic indexing for large peer-to-peer systems, in: Proceedings of the 27th ACM International Conference on Research and Development in Information Retrieval (SIGIR 2004), Sheffield, UK, July 25\u201329, 2004, pp. 112\u2013121.","DOI":"10.1145\/1008992.1009014"},{"key":"10.1016\/j.is.2006.09.004_bib64","series-title":"Matrix Computations","author":"Golub","year":"1996"},{"issue":"1","key":"10.1016\/j.is.2006.09.004_bib65","doi-asserted-by":"crossref","first-page":"56","DOI":"10.1016\/j.ipm.2004.11.007","article-title":"A framework for understanding LSI performance","volume":"42","author":"Kontostathis","year":"2006","journal-title":"Inform. Process. Manage."},{"key":"10.1016\/j.is.2006.09.004_bib66","doi-asserted-by":"crossref","first-page":"127","DOI":"10.1023\/A:1009953814988","article-title":"Automating the construction of internet portals with machine learning","volume":"3","author":"McCallum","year":"2000","journal-title":"Inform. Retrieval"},{"key":"10.1016\/j.is.2006.09.004_bib67","doi-asserted-by":"crossref","unstructured":"M. Fisher, R. Everson, When are links useful? Experiments in text classification, in: Proceedings of the 25th European Conference on Advances Information Retrieval (ECIR), Pisa, April 2003, pp. 41\u201356.","DOI":"10.1007\/3-540-36618-0_4"},{"key":"10.1016\/j.is.2006.09.004_bib68","unstructured":"B. Davison, Unifying text and link analysis, in: Proceedings of the Workshop Text-Mining & Link-Analysis (TextLink) (IJCAI), Acapulco, August 9, 2003."},{"key":"10.1016\/j.is.2006.09.004_bib69","doi-asserted-by":"crossref","unstructured":"W. Xi, E. Fox, W. Fan, B. Zhang, Z. Chen, J. Yan, D. Zhuang, SimFusion: measuring similarity using unified relationship matrix, in: Proceedings of the ACM International Conference on Research and Development in Information Retrieval (SIGIR 2005), 2005, pp. 130\u2013137.","DOI":"10.1145\/1076034.1076059"},{"key":"10.1016\/j.is.2006.09.004_bib70","doi-asserted-by":"crossref","unstructured":"R. Baeza-Yates, C. Castillo, M. Marin, A. Rodriguez, Crawling a country: better strategies than breadth-first for web page ordering, in: Proceedings of the 14th International Conference on World Wide Web (WWW 05), 2005, pp. 864\u2013872.","DOI":"10.1145\/1062745.1062768"}],"container-title":["Information Systems"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S0306437906000792?httpAccept=text\/xml","content-type":"text\/xml","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S0306437906000792?httpAccept=text\/plain","content-type":"text\/plain","content-version":"vor","intended-application":"text-mining"}],"deposited":{"date-parts":[[2025,10,5]],"date-time":"2025-10-05T15:05:32Z","timestamp":1759676732000},"score":1,"resource":{"primary":{"URL":"https:\/\/linkinghub.elsevier.com\/retrieve\/pii\/S0306437906000792"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2007,9]]},"references-count":66,"journal-issue":{"issue":"6","published-print":{"date-parts":[[2007,9]]}},"alternative-id":["S0306437906000792"],"URL":"https:\/\/doi.org\/10.1016\/j.is.2006.09.004","relation":{},"ISSN":["0306-4379"],"issn-type":[{"value":"0306-4379","type":"print"}],"subject":[],"published":{"date-parts":[[2007,9]]},"assertion":[{"value":"Elsevier","name":"publisher","label":"This article is maintained by"},{"value":"Combining text and link analysis for focused crawling\u2014An application for vertical search engines","name":"articletitle","label":"Article Title"},{"value":"Information Systems","name":"journaltitle","label":"Journal Title"},{"value":"https:\/\/doi.org\/10.1016\/j.is.2006.09.004","name":"articlelink","label":"CrossRef DOI link to publisher maintained version"},{"value":"article","name":"content_type","label":"Content Type"},{"value":"Copyright \u00a9 2006 Elsevier B.V. All rights reserved.","name":"copyright","label":"Copyright"}]}}