{"id":1157,"date":"2019-06-22T22:11:04","date_gmt":"2019-06-22T14:11:04","guid":{"rendered":"https:\/\/199604.com\/?p=1157"},"modified":"2019-06-22T22:11:42","modified_gmt":"2019-06-22T14:11:42","slug":"jsoup%e5%b0%8f%e4%be%8b%e5%ad%90","status":"publish","type":"post","link":"https:\/\/199604.com\/1157","title":{"rendered":"JSoup\u5c0f\u4f8b\u5b50"},"content":{"rendered":"\n<p>emmmm&#8230;\u597d\u50cf\u6ca1\u6709\u4ec0\u4e48\u8981\u5907\u6ce8\u7684\u5c31\u662f\u81ea\u5df1\u7ec3\u624b\u722c\u4fe1\u606f\u5427?<\/p>\n<hr>\n<pre class=\"lang:default decode:true\">\npackage com.htjf.main;\n\nimport java.io.File;\nimport java.io.FileWriter;\nimport java.io.IOException;\n\nimport org.jsoup.Jsoup;\nimport org.jsoup.helper.StringUtil;\nimport org.jsoup.nodes.Document;\nimport org.jsoup.nodes.Element;\nimport org.jsoup.select.Elements;\n\npublic class HelloWordJSoup {\n\tpublic static void main(String[] args) {\n\t\tfor(int i=1;i<=100;i++){\n\t\ttry {\n\t\t\tString url = \"http:\/\/www.ybzhan.cn\/Company\/a_t0\/list_p\"+i+\".html\";\n\t\t\tSystem.out.println(url);\n\t\t\tDocument doc = Jsoup.connect(url).get();\n\t\t\tElements companyLists =  doc.select(\".companyList\");\n\t\t\tStringBuffer stringBuffer=new StringBuffer();\n\t\t\tfor (Element companyList : companyLists) {\n\t\t\t\t\/\/\u516c\u53f8\u540d\n\t\t\t\tElement companyNameDiv =  companyList.select(\"div.companyName\").first();\n\t\t\t\tElement link  = companyNameDiv.select(\"a\").first();\n\t\t\t\t\n\t\t\t\tString shopUrl =  \"http:\/\/www.ybzhan.cn\"+link.attr(\"href\");\n\t\t\t\tString companyName = link.text();\n\t\t\t\t\/\/\u4e3b\u8425\u4ea7\u54c1\n\t\t\t\tElement ps =  companyList.select(\"dt > p\").first();\n\t\t\t\tString mainProducts = ps.text().replace(\"\u4e3b\u8425\u4ea7\u54c1\", \"\");\n\t\t\t\t\/\/\u4ecb\u7ecd\u9875\u9762\n\t\t\t\tString personalityUrl = \"\";\n\t\t\t\tString shopUrl2 = \"\";\n\t\t\t\tString companyUrl = 
\"\";\n\t\t\t\tSystem.out.println(shopUrl);\n\t\t\t\tif(!StringUtil.isBlank(shopUrl)&&!shopUrl.contains(\"Company\/Detail\")){\n\t\t\t\t\tDocument contactusDoc= Jsoup.connect( shopUrl+\"\/contactus.html\").get();\n\t\t\t\t\tElements ss = contactusDoc.getElementsByTag(\"p\");\n\t\t\t\t\tfor (Element element : ss) {\n\t\t\t\t\t\tif(element.text().contains(\"\u4e2a \u6027 \u5316\")){\n\t\t\t\t\t\t\tpersonalityUrl = element.text();\n\t\t\t\t\t\t}else if(element.text().contains(\"\u5546\u94fa\u7f51\u5740\")){\n\t\t\t\t\t\t\tshopUrl2 = element.text();\n\t\t\t\t\t\t}else if(element.text().contains(\"\u516c\u53f8\u7f51\u7ad9\")){\n\t\t\t\t\t\t\tcompanyUrl = element.text();\n\t\t\t\t\t\t}\n\t\t\t\t\t}\n\t\t\t\t\tif(StringUtil.isBlank(personalityUrl)||StringUtil.isBlank(shopUrl2)||StringUtil.isBlank(companyUrl)){\n\t\t\t\t\t\tElements dl = contactusDoc.getElementsByTag(\"dl\");\n\t\t\t\t\t\tfor (Element element : dl) {\n\t\t\t\t\t\t\tif(element.text().contains(\"\u4e2a \u6027 \u5316\")){\n\t\t\t\t\t\t\t\tif(StringUtil.isBlank(personalityUrl)){\n\t\t\t\t\t\t\t\t\tpersonalityUrl = element.text();\n\t\t\t\t\t\t\t\t}\n\t\t\t\t\t\t\t}else if(element.text().contains(\"\u5546\u94fa\u7f51\u5740\")){\n\t\t\t\t\t\t\t\tif(StringUtil.isBlank(shopUrl2)){\n\t\t\t\t\t\t\t\t\tshopUrl2 = element.text();\n\t\t\t\t\t\t\t\t}\n\t\t\t\t\t\t\t}else if(element.text().contains(\"\u516c\u53f8\u7f51\u7ad9\")){\n\t\t\t\t\t\t\t\tif(StringUtil.isBlank(companyUrl)){\n\t\t\t\t\t\t\t\t\tcompanyUrl = element.text();\n\t\t\t\t\t\t\t\t}\n\t\t\t\t\t\t\t}\n\t\t\t\t\t\t}\n\t\t\t\t\t}\n\t\t\t\t}\n\t\t\t\t\/\/\u5199\u5165\n\t\t\t\tstringBuffer.append(companyName+\";\");\n\t\t\t\tstringBuffer.append(mainProducts+\";\");\n\t\t\t\tif(StringUtil.isBlank(shopUrl2.trim())){\n\t\t\t\t\tstringBuffer.append(shopUrl+\";\");\n\t\t\t\t}else{\n\t\t\t\t\tstringBuffer.append(shopUrl2.replace(\"\u5546\u94fa\u7f51\u5740\uff1a\", 
\"\").trim()+\";\");\n\t\t\t\t}\n\t\t\t\tif(StringUtil.isBlank(companyUrl.trim())){\n\t\t\t\t\tstringBuffer.append(shopUrl+\";\");\n\t\t\t\t}else{\n\t\t\t\t\tstringBuffer.append(companyUrl.replace(\"\u516c\u53f8\u7f51\u7ad9\uff1a\", \"\").trim()+\";\");\n\t\t\t\t}\n\t\t\t\tif(StringUtil.isBlank(personalityUrl.trim())){\n\t\t\t\t\tstringBuffer.append(shopUrl);\n\t\t\t\t}else{\n\t\t\t\t\tstringBuffer.append(personalityUrl.replace(\"\u4e2a \u6027 \u5316\uff1a\", \"\").trim());\n\t\t\t\t}\n\t\t\t\tstringBuffer.append(System.lineSeparator());\/\/\u6362\u884c\n\t\t\t}\n\t\t\tnew HelloWordJSoup().writerData(stringBuffer);\n\t\t\tstringBuffer.setLength(0);\n\t\t} catch (IOException e) {\n\t\t\t\/\/ TODO Auto-generated catch block\n\t\t\te.printStackTrace();\n\t\t}\n\t\t}\n\t}\n\tpublic void writerData(StringBuffer stringBuffer){\n\t\tFileWriter out = null;\n\t\tString fileName = \"G:\"+File.separator+\"pushFile_test\"+File.separator+\"data.csv\";\n\t\tFile writeFile  = new File(fileName); \/\/\u6587\u4ef6\u8def\u5f84\u540d\n\t\tif(!writeFile.exists()&&!writeFile.isFile()){\/\/ \u5982\u679c\u6587\u4ef6\u4e0d\u5b58\u5728,\u521b\u5efa\u6587\u4ef6\n\t\t\ttry {\n\t\t\t\twriteFile.createNewFile();\n\t\t\t} catch (IOException e) {\n\t\t\t\t\/\/ TODO Auto-generated catch block\n\t\t\t}\n\t\t}\n\t\t\n\t\ttry {\n\t\t\tout = new FileWriter(writeFile,true);\n\t\t\tif(stringBuffer.length()>0){\n\t\t\t\tout.write(stringBuffer.toString());\n\t\t\t}\n\t\t\tout.flush();\n\t\t\tout.close();\n\t\t} catch (IOException e) {\n\t\t\t\/\/ TODO Auto-generated catch block\n\t\t\te.printStackTrace();\n\t\t}\n\t\tstringBuffer.setLength(0);\n\t} \n}\n\n<\/pre>\n<p>\u7ed3\u675f&#8230;<\/p>\n<div class='fancybox-wrapper lazyload-container-unload' data-fancybox='post-images' href='http:\/\/qn.199604.com\/wp-content\/uploads\/2018\/08\/20180801232922.jpg'><img class=\"lazyload lazyload-style-1\" 
src=\"data:image\/svg+xml;base64,PCEtLUFyZ29uTG9hZGluZy0tPgo8c3ZnIHdpZHRoPSIxIiBoZWlnaHQ9IjEiIHhtbG5zPSJodHRwOi8vd3d3LnczLm9yZy8yMDAwL3N2ZyIgc3Ryb2tlPSIjZmZmZmZmMDAiPjxnPjwvZz4KPC9zdmc+\"  decoding=\"async\" data-original=\"http:\/\/qn.199604.com\/wp-content\/uploads\/2018\/08\/20180801232922.jpg\" src=\"data:image\/png;base64,iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAYAAAAfFcSJAAAAAXNSR0IArs4c6QAAAARnQU1BAACxjwv8YQUAAAAJcEhZcwAADsQAAA7EAZUrDhsAAAANSURBVBhXYzh8+PB\/AAffA0nNPuCLAAAAAElFTkSuQmCC\" \/><\/div>\n\n\n\n<p><\/p>\n","protected":false},"excerpt":{"rendered":"<p>emmmm&#8230;\u597d\u50cf\u6ca1\u6709\u4ec0\u4e48\u8981\u5907\u6ce8\u7684\u5c31\u662f\u81ea\u5df1\u7ec3\u624b\u722c\u4fe1\u606f\u5427? package com.htjf.main [&hellip;]<\/p>\n","protected":false},"author":1,"featured_media":0,"comment_status":"open","ping_status":"open","sticky":false,"template":"","format":"standard","meta":{"footnotes":""},"categories":[130],"tags":[209],"class_list":["post-1157","post","type-post","status-publish","format-standard","hentry","category-java","tag-jsoup"],"_links":{"self":[{"href":"https:\/\/199604.com\/wp-json\/wp\/v2\/posts\/1157","targetHints":{"allow":["GET"]}}],"collection":[{"href":"https:\/\/199604.com\/wp-json\/wp\/v2\/posts"}],"about":[{"href":"https:\/\/199604.com\/wp-json\/wp\/v2\/types\/post"}],"author":[{"embeddable":true,"href":"https:\/\/199604.com\/wp-json\/wp\/v2\/users\/1"}],"replies":[{"embeddable":true,"href":"https:\/\/199604.com\/wp-json\/wp\/v2\/comments?post=1157"}],"version-history":[{"count":3,"href":"https:\/\/199604.com\/wp-json\/wp\/v2\/posts\/1157\/revisions"}],"predecessor-version":[{"id":1161,"href":"https:\/\/199604.com\/wp-json\/wp\/v2\/posts\/1157\/revisions\/1161"}],"wp:attachment":[{"href":"https:\/\/199604.com\/wp-json\/wp\/v2\/media?parent=1157"}],"wp:term":[{"taxonomy":"category","embeddable":true,"href":"https:\/\/199604.com\/wp-json\/wp\/v2\/categories?post=1157"},{"taxonomy":"post_tag","embeddable":true,"href":"https:\/\/199604.com\/wp
-json\/wp\/v2\/tags?post=1157"}],"curies":[{"name":"wp","href":"https:\/\/api.w.org\/{rel}","templated":true}]}}