提交 8f456e71 作者: liuweigang

采集代码更新

上级 0da05fc2
This source diff could not be displayed because it is too large. You can view the blob instead.
...@@ -185,9 +185,9 @@ ...@@ -185,9 +185,9 @@
<dependency> <dependency>
<groupId>mysql-connector-java</groupId> <groupId>mysql-connector-java</groupId>
<artifactId>mysql-connector-java</artifactId> <artifactId>mysql-connector-java</artifactId>
<version>5.1.7-bin</version> <version>5.1.9</version>
<scope>system</scope> <scope>system</scope>
<systemPath>${pom.basedir}/lib/mysql-connector-java-5.1.7-bin.jar</systemPath> <systemPath>${pom.basedir}/lib/mysql-connector-java-5.1.9.jar</systemPath>
</dependency> </dependency>
<dependency> <dependency>
...@@ -338,7 +338,7 @@ ...@@ -338,7 +338,7 @@
<dependency> <dependency>
<groupId>com.alibaba</groupId> <groupId>com.alibaba</groupId>
<artifactId>druid</artifactId> <artifactId>druid</artifactId>
<version>1.0.5</version> <version>1.1.10</version>
</dependency> </dependency>
<dependency> <dependency>
<groupId>com.alibaba</groupId> <groupId>com.alibaba</groupId>
......
...@@ -5,6 +5,15 @@ import com.zzsn.job.KafkaConsumerJob; ...@@ -5,6 +5,15 @@ import com.zzsn.job.KafkaConsumerJob;
import com.zzsn.search.MetaBaiduSearchThread; import com.zzsn.search.MetaBaiduSearchThread;
import com.zzsn.search.entity.KeywordMsg; import com.zzsn.search.entity.KeywordMsg;
import com.zzsn.search.util.SpringContextUtil; import com.zzsn.search.util.SpringContextUtil;
import com.zzsn.utility.index.Constants;
import lombok.extern.slf4j.Slf4j;
import org.apache.kafka.clients.CommonClientConfigs;
import org.apache.kafka.clients.consumer.ConsumerConfig;
import org.apache.kafka.clients.consumer.ConsumerRecord;
import org.apache.kafka.clients.consumer.ConsumerRecords;
import org.apache.kafka.clients.consumer.KafkaConsumer;
import org.apache.kafka.common.TopicPartition;
import org.apache.kafka.common.serialization.StringDeserializer;
import org.springframework.boot.CommandLineRunner; import org.springframework.boot.CommandLineRunner;
import org.springframework.boot.SpringApplication; import org.springframework.boot.SpringApplication;
import org.springframework.boot.autoconfigure.SpringBootApplication; import org.springframework.boot.autoconfigure.SpringBootApplication;
...@@ -15,7 +24,11 @@ import org.springframework.boot.web.servlet.support.SpringBootServletInitializer ...@@ -15,7 +24,11 @@ import org.springframework.boot.web.servlet.support.SpringBootServletInitializer
import org.springframework.context.annotation.Bean; import org.springframework.context.annotation.Bean;
import javax.servlet.MultipartConfigElement; import javax.servlet.MultipartConfigElement;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Properties;
@Slf4j
@SpringBootApplication(scanBasePackages = "com.zzsn") @SpringBootApplication(scanBasePackages = "com.zzsn")
public class CrawlerMateSearchApplication extends SpringBootServletInitializer implements CommandLineRunner { public class CrawlerMateSearchApplication extends SpringBootServletInitializer implements CommandLineRunner {
@Override @Override
...@@ -28,26 +41,76 @@ public class CrawlerMateSearchApplication extends SpringBootServletInitializer i ...@@ -28,26 +41,76 @@ public class CrawlerMateSearchApplication extends SpringBootServletInitializer i
@Override @Override
public void run(String... args) throws Exception { public void run(String... args) throws Exception {
consumerPartition();
// System.out.println("——————++++++++++++——————==="); // System.out.println("——————++++++++++++——————===");
// String key="{\n" + // String key="{\n" +
// " \"@class\": \"com.zzsn.clb.common.model.task.dto.titr.KeyWordsDTO\",\n" + // " \"id\": \"1532331241232039937\",\n" +
// " \"id\": \"1532255512859410433\",\n" + // " \"wordsCode\": \"KW-20220602-0003\",\n" +
// " \"wordsCode\": \"KW-20220602-0001\",\n" + // " \"wordsName\": \"人工智能应用\",\n" +
// " \"wordsName\": \"供应链25强企业\",\n" + // " \"keyWord\": \"(人工智能|人工智能应用|应用|人工智能技术|人工智能领域|人工智能系统|人工智能产品|智能汽车|无人驾驶|人脸识别|人像识别|面部识别|机器翻译|自然语言处理|声纹识别|智能客服|智能音箱|语音识别|语音合成|个性化推荐|图像识别|图像搜索|人工智能应用|大数据分析|大数据|人工智能设计|人机交互|人工智能方案|人工智能解决方案|人工智能实验室|人工智能模型|人工智能问题|人工智能流程|人工智能设备|生成式对抗网络|计算智能|感知智能|认知智能|机器学习|增强学习|结构化数据|非结构化数据|传感器|理解能力|归纳能力|推理能力|特征提取|模式分析|预测|智能农业|智能工业|智能工厂|工业机器人|智能手机|无人驾驶汽车|无人机|智能机器人|环境感知|路径规划|行为决策|算法|智能分拣|设备健康管理|表面缺陷检测|智能决策|数字孪生|创成式设计|需求预测|供应链优化|深度学习|Applications of artificial intelligence|artificial intelligence|Applications|AI|Driverless Car|Automatic Speech Recognition|ASR|Natural Language Processing|NLP|Text To Speech|TTS|GAN| generative adversarial network|SLAM|simultaneous localization and mapping|Generative Design|AI Application in E-Commerce|Personalized Shopping|AI-powered Assistants|Fraud Prevention| Applications Of Artificial Intelligence in Education|Administrative Tasks Automated to Aid Educators|Administrative Tasks Automated to Aid Educators|Creating Smart Content|Voice Assistants|Personalized Learning|Applications of Artificial Intelligence in Lifestyle|Autonomous Vehicles|Spam Filters|Facial Recognition|Recommendation System|Applications of Artificial intelligence in Navigation|Applications of Artificial Intelligence in Robotics|Applications of Artificial Intelligence in Human Resource|Applications of Artificial Intelligence in Healthcare|Applications of Artificial Intelligence in Agriculture|Applications of Artificial Intelligence in Gaming|Applications of Artificial Intelligence in Automobiles|Applications of Artificial Intelligence in Social Media|Applications of Artificial Intelligence in Marketing| Applications of Artificial Intelligence in Chatbots|Applications of of Artificial Intelligence in Finance)+(人工智能|artificial intelligence|AI)\\n\",\n" +
// " \"keyWord\": \"(强生|百事可乐|辉瑞|英特尔|雀巢|联想|微软|欧莱雅 |可口可乐|耐克|沃尔玛|惠普|迪阿吉奥|戴尔科技|索引|宝马|艾伯维|西门子|阿斯利康|通用磨坊|英美烟草|阿里巴巴)+(供应链)\",\n" +
// " \"exclusionWord\": null,\n" + // " \"exclusionWord\": null,\n" +
// " \"status\": \"1\",\n" + // " \"status\": \"1\",\n" +
// " \"subjectId\": null,\n" + // " \"subjectId\": null,\n" +
// " \"subjectIds\": null,\n" + // " \"subjectIds\": null,\n" +
// " \"startTime\":\"1622563200000\",\n" + // " \"startTime\": null,\n" +
// " \"endTime\": \"1654099200000\"\n" + // " \"endTime\": null \n" +
// "}"; // "}";
// KeywordMsg keywordMsg = new Gson().fromJson(key, KeywordMsg.class); // try {
// MetaBaiduSearchThread metaSearchThread=new MetaBaiduSearchThread(); // KeywordMsg keywordMsg = new Gson().fromJson(key, KeywordMsg.class);
// metaSearchThread.keywordMsg=keywordMsg; // MetaBaiduSearchThread metaSearchThread = new MetaBaiduSearchThread();
// metaSearchThread.crawler(); // metaSearchThread.keywordMsg = keywordMsg;
// metaSearchThread.crawler();
// }catch (Exception e){
// e.printStackTrace();
// }
} }
/**
 * Consumes keyword messages from the configured Kafka topic partitions and runs a
 * Baidu meta-search crawl for each one.
 *
 * <p>The partitions are read from {@code Constants.KAFKA_CONSUMER_PARTITION}
 * (comma separated partition ids) and assigned manually. The method then polls in an
 * endless loop on the calling thread; it only leaves the loop when {@code poll} or the
 * assignment throws. NOTE(review): presumably {@code consumer.wakeup()} from another
 * thread is the intended way to stop it, as the original comment suggested - confirm.
 *
 * <p>A failure while handling one record is logged and does not stop the loop.
 *
 * @throws NumberFormatException if a configured partition id is not an integer
 */
public void consumerPartition (){
    log.info("定时获取mq消息");
    // Resolve the partition assignment first so that a malformed configuration
    // fails before a consumer (and its network resources) is created.
    ArrayList<TopicPartition> topicPartitions = new ArrayList<>();
    for (String partition : Constants.KAFKA_CONSUMER_PARTITION.split(",")) {
        // trim() tolerates values such as "0, 1" in the configuration.
        topicPartitions.add(new TopicPartition(Constants.KAFKA_CONSUMER_TOPIC, Integer.parseInt(partition.trim())));
    }
    // Gson instances are reusable; one is enough for the whole loop.
    Gson gson = new Gson();
    // 1. Create the consumer.
    KafkaConsumer<String, String> consumer = createConsumer();
    try {
        consumer.assign(topicPartitions);
        while (true) {
            // Block for up to one second waiting for data. A zero timeout would return
            // immediately and turn this loop into a busy spin when the topic is idle.
            ConsumerRecords<String, String> records = consumer.poll(1000);
            for (ConsumerRecord<String, String> record : records) {
                String payload = record.value();
                if (payload == null) {
                    // Nothing to parse; skip instead of failing inside the handler.
                    log.info("关键词解析异常: " + payload);
                    continue;
                }
                try {
                    KeywordMsg keywordMsg = gson.fromJson(payload, KeywordMsg.class);
                    log.info("关键词解析keywordMsg正常");
                    // Commit before crawling: the crawl can take a long time and the
                    // message must not be redelivered if the process is restarted meanwhile.
                    consumer.commitSync();
                    MetaBaiduSearchThread metaSearchThread = new MetaBaiduSearchThread();
                    metaSearchThread.keywordMsg = keywordMsg;
                    metaSearchThread.crawler();
                    log.info("关键词请求结束++++");
                } catch (Exception e) {
                    // Keep consuming, but record the cause; the payload is concatenated
                    // directly so the handler itself can never throw.
                    log.error("关键词解析异常: " + payload, e);
                }
            }
        }
    } finally {
        // Release sockets and buffers when the loop is left through an exception.
        consumer.close();
    }
}
/**
 * Builds a String/String Kafka consumer from the connection settings in {@code Constants}.
 *
 * <p>The consumer fetches a single record per poll and allows up to one hour between
 * polls, so that one long-running crawl per message does not exceed the poll interval.
 *
 * @return a new, unsubscribed consumer; the caller is responsible for closing it
 */
private static KafkaConsumer<String, String> createConsumer() {
    Properties properties = new Properties();
    // Use the class logger instead of stdout so the value ends up in the application log.
    log.info("Kafka bootstrap servers: {}", Constants.KAFKA_CONSUMER_SERVERS);
    properties.put(CommonClientConfigs.BOOTSTRAP_SERVERS_CONFIG, Constants.KAFKA_CONSUMER_SERVERS);
    properties.put(ConsumerConfig.KEY_DESERIALIZER_CLASS_CONFIG, StringDeserializer.class);
    properties.put(ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG, StringDeserializer.class);
    properties.put(ConsumerConfig.GROUP_ID_CONFIG, Constants.KAFKA_CONSUMER_GROUP_ID);
    // NOTE(review): the consuming loop also calls commitSync() per record, so auto commit
    // looks redundant here - confirm whether it should be "false".
    properties.put(ConsumerConfig.ENABLE_AUTO_COMMIT_CONFIG, "true");
    // Where to start reading when no committed offset exists (latest / earliest).
    properties.put(ConsumerConfig.AUTO_OFFSET_RESET_CONFIG,Constants.KAFKA_CONSUMER_AUTO_OFFSET_RESET);
    // Maximum allowed time between two polls: 1 hour.
    properties.put("max.poll.interval.ms", 60*60*1000);
    // One record per poll: each message triggers a full crawl.
    properties.put(ConsumerConfig.MAX_POLL_RECORDS_CONFIG, 1);
    return new KafkaConsumer<>(properties);
}
} }
...@@ -80,9 +80,9 @@ public class KafkaConsumerJob { ...@@ -80,9 +80,9 @@ public class KafkaConsumerJob {
} }
@Scheduled(cron = "0 0/2 * * * ?") // @Scheduled(cron = "0 0/2 * * * ?")
@Async("asyncTaskExecutor") @Async("asyncTaskExecutor")
public void consumer_partition (){ public void consumerPartition (){
log.info("定时获取mq消息"); log.info("定时获取mq消息");
//1.创建消费者 //1.创建消费者
KafkaConsumer<String, String> consumer = createConsumer(); KafkaConsumer<String, String> consumer = createConsumer();
......
...@@ -70,8 +70,8 @@ public class BaiduSearchThread implements Runnable { ...@@ -70,8 +70,8 @@ public class BaiduSearchThread implements Runnable {
public KeywordMsg keywordMsg; public KeywordMsg keywordMsg;
public List<String> keywords; public List<String> keywords;
public Integer threadId; public Integer threadId;
String startTime="2016-01-01"; String startTime="2021-09-01";
String endTime="2022-06-01"; String endTime="2022-07-11";
String cache_key="baidu_keyWords"; String cache_key="baidu_keyWords";
@Override @Override
...@@ -123,28 +123,32 @@ public class BaiduSearchThread implements Runnable { ...@@ -123,28 +123,32 @@ public class BaiduSearchThread implements Runnable {
}catch (Exception e){ }catch (Exception e){
log.info("缓存出问题"); log.info("缓存出问题");
} }
String url1= "https://www.baidu.com/s?rtt=1&bsst=1&cl=2&tn=news&ie=utf-8&word=[keyword]&x_bfe_rqs=03E80&x_bfe_tjscore=0.100000&tngroupname=organic_news&newVideo=12&goods_entry_switch=1&rsv_dl=news_b_pn&pn=[pn]"; String url1= "https://www.baidu.com/s?rtt=1&bsst=1&cl=2&tn=news&ie=utf-8&word=[keyword]";
// String url1= "https://www.baidu.com/s?rtt=1&bsst=1&cl=2&tn=news&ie=utf-8&word=[keyword]&x_bfe_rqs=03E80&x_bfe_tjscore=0.100000&tngroupname=organic_news&newVideo=12&goods_entry_switch=1&rsv_dl=news_b_pn&pn=[pn]";
// String url1= "https://www.baidu.com/s?wd=[keyword]&pn=[pn]&oq=[keyword]&tn=baiduhome_pg&ie=utf-8&rsv_idx=2&gpc=stf=[startTime],[endTime]|stftype=2&tfflag=1"; // String url1= "https://www.baidu.com/s?wd=[keyword]&pn=[pn]&oq=[keyword]&tn=baiduhome_pg&ie=utf-8&rsv_idx=2&gpc=stf=[startTime],[endTime]|stftype=2&tfflag=1";
String url=""; String url="";
List<String> urlList = new ArrayList<String>(); List<String> urlList = new ArrayList<String>();
log.info("url:" + url); log.info("url:" + url);
String charset = "utf-8"; String charset = "utf-8";
Long orgId = Long.parseLong("2022060701"); // Long orgId = Long.parseLong("2022082801");
Long tid = Long.parseLong("2022060701"); Long orgId = Long.parseLong("202208290111");
Long tid = Long.parseLong("202208290111");
String proxyid=Constants.PROXY; String proxyid=Constants.PROXY;
if(proxyid.equals("1")) { if(proxyid.equals("1")) {
CatchWebNews(RecorderUtil.CatchWebOfBaiduByProxy(urlList, charset, orgId, tid),kWord); CatchWebNews(RecorderUtil.CatchWebOfBaiduByProxy(urlList, charset, orgId, tid),kWord);
}else { }else {
for (int i = 0; i < 100; i++) { // for (int i = 0; i < 2; i++) {
String urla = url1.replace("[keyword]",kWord); // String urla = url1.replace("[keyword]",kWord);
// urla = urla.replace("[startTime]",startTime); //// urla = urla.replace("[startTime]",startTime);
// urla = urla.replace("[endTime]",endTime); //// urla = urla.replace("[endTime]",endTime);
urla=urla.replace("[pn]",i*10+""); // urla=urla.replace("[pn]",i*10+"");
urlList.add(urla); // urlList.add(urla);
} // }
// List<CatchWebByMetaSearch> catchWebByMetaSearches = RecorderUtil.catchWebOfBaiduList(urlList, charset, orgId, tid); // List<CatchWebByMetaSearch> catchWebByMetaSearches = RecorderUtil.catchWebOfBaiduList(urlList, charset, orgId, tid);
try { try {
String urla = url1.replace("[keyword]",kWord);
urlList.add(urla);
RecorderUtil.CatchWebDetailOfBaidu(urlList, charset, orgId, tid, kWord); RecorderUtil.CatchWebDetailOfBaidu(urlList, charset, orgId, tid, kWord);
}catch (Exception e){ }catch (Exception e){
e.printStackTrace(); e.printStackTrace();
......
...@@ -25,7 +25,7 @@ public class DBConnection { ...@@ -25,7 +25,7 @@ public class DBConnection {
} }
// 获取数据库的连接 // 获取数据库的连接
try { try {
Connection conn = java.sql.DriverManager.getConnection("jdbc:mysql://localhost:3306/clb_project?useUnicode=true&characterEncoding=utf8", "root", "root"); Connection conn = java.sql.DriverManager.getConnection("jdbc:mysql://192.168.1.164:3306/clb_project?useUnicode=true&characterEncoding=utf8", "root", "root");
return conn; return conn;
} catch (SQLException e1) { } catch (SQLException e1) {
e1.printStackTrace(); e1.printStackTrace();
......
...@@ -30,5 +30,10 @@ public class KeywordMsg { ...@@ -30,5 +30,10 @@ public class KeywordMsg {
private Long startTime; private Long startTime;
private Long endTime; private Long endTime;
//需要启动的信息采集器
private List<String> searchEngines;
//采集的要求(1:标题 2:正文 3:全文)
private String crawlerType;
} }
...@@ -83,6 +83,7 @@ public class PublishDateUtil { ...@@ -83,6 +83,7 @@ public class PublishDateUtil {
public static SimpleDateFormat formatter11_1 = new SimpleDateFormat("MMM dd, yyyy",Locale.ENGLISH); public static SimpleDateFormat formatter11_1 = new SimpleDateFormat("MMM dd, yyyy",Locale.ENGLISH);
public static SimpleDateFormat formatter12 = new SimpleDateFormat("dd-MMM-yyyy HH:mm",Locale.ENGLISH); public static SimpleDateFormat formatter12 = new SimpleDateFormat("dd-MMM-yyyy HH:mm",Locale.ENGLISH);
public static SimpleDateFormat formatter12_1 = new SimpleDateFormat("dd-MMM-yyyy",Locale.ENGLISH); public static SimpleDateFormat formatter12_1 = new SimpleDateFormat("dd-MMM-yyyy",Locale.ENGLISH);
// public static SimpleDateFormat formatter13_1 = new SimpleDateFormat("dd-MMM-yyyy",Locale.ITALY);
private static Date thresholdDate = null; private static Date thresholdDate = null;
......
package com.zzsn.search.util; package com.zzsn.search.util;
import cn.hutool.core.util.RandomUtil;
import com.zzsn.search.BaiduSearchThread; import com.zzsn.search.BaiduSearchThread;
import com.zzsn.search.MetaBaiduSearchThread; import com.zzsn.search.MetaBaiduSearchThread;
import com.zzsn.search.oracledb.OracleDBManager; import com.zzsn.search.oracledb.OracleDBManager;
...@@ -322,15 +323,15 @@ public class RecorderUtil { ...@@ -322,15 +323,15 @@ public class RecorderUtil {
} catch (URISyntaxException e) { } catch (URISyntaxException e) {
log.info("url处理异常!"); log.info("url处理异常!");
} }
// docstr = proxyRequest(uri_code); docstr = proxyRequest(uri_code);
String proxyIP = getProxyIP(); // String proxyIP = getProxyIP();
log.info("使用的代理IP:"+proxyIP ); // log.info("使用的代理IP:"+proxyIP );
String[] proxys=proxyIP.split("-"); // String[] proxys=proxyIP.split("-");
String proxyHost = proxys[0]; // String proxyHost = proxys[0];
int proxyPort = Integer.parseInt(proxys[1]); // int proxyPort = Integer.parseInt(proxys[1]);
String userName = proxys[2]; // String userName = proxys[2];
String password = proxys[3]; // String password = proxys[3];
docstr = HttpClientProxy.build(proxyHost, proxyPort, userName, password).requestUrl(uri_code); // 代理认证 // docstr = HttpClientProxy.build(proxyHost, proxyPort, userName, password).requestUrl(uri_code); // 代理认证
log.info("请求内容:"+docstr); log.info("请求内容:"+docstr);
} catch (Exception e) { } catch (Exception e) {
log.info("使用代理请求异常"); log.info("使用代理请求异常");
...@@ -486,9 +487,9 @@ public class RecorderUtil { ...@@ -486,9 +487,9 @@ public class RecorderUtil {
} }
BaiduSearchThread baiduSearchThread=new BaiduSearchThread(); BaiduSearchThread baiduSearchThread=new BaiduSearchThread();
int repeat = baiduSearchThread.CatchWebNews(metaSearchList, keyword); int repeat = baiduSearchThread.CatchWebNews(metaSearchList, keyword);
if(repeat/metaSearchList.size()>0.7){ // if(repeat/metaSearchList.size()>0.7){
break; // break;
} // }
catchWebByMetaSearchList.addAll(metaSearchList); catchWebByMetaSearchList.addAll(metaSearchList);
} }
return catchWebByMetaSearchList; return catchWebByMetaSearchList;
...@@ -1305,7 +1306,8 @@ public class RecorderUtil { ...@@ -1305,7 +1306,8 @@ public class RecorderUtil {
} }
public static String getProxyIP(){ public static String getProxyIP(){
String searchSql = "select proxy from CIS_sys_Proxy where ID = 1";
String searchSql = "select proxy from CIS_sys_Proxy where ID = "+ RandomUtil.randomInt(5);
String proxy=""; String proxy="";
OracleDBManager dm = new OracleDBManager(); OracleDBManager dm = new OracleDBManager();
String[] coulmn = null; String[] coulmn = null;
...@@ -1326,6 +1328,7 @@ public class RecorderUtil { ...@@ -1326,6 +1328,7 @@ public class RecorderUtil {
} }
return proxy; return proxy;
} }
public static HttpClient getHttpClient() { public static HttpClient getHttpClient() {
String proxyIP = getProxyIP(); String proxyIP = getProxyIP();
log.info("使用的代理IP:"+proxyIP ); log.info("使用的代理IP:"+proxyIP );
......
...@@ -121,7 +121,7 @@ public class SplitKeyword { ...@@ -121,7 +121,7 @@ public class SplitKeyword {
} }
public static void main(String[] args) { public static void main(String[] args) {
String kwords="(思科系统|施耐德电气|高露洁棕榄|强生|百事可乐|辉瑞|英特尔|雀巢|联想|微软|欧莱雅 |可口可乐|耐克|沃尔玛|惠普|迪阿吉奥|戴尔科技|索引|宝马|艾伯维|西门子|阿斯利康|通用磨坊|英美烟草|阿里巴巴)+供应链"; String kwords="(国家能源投资集团有限责任公司|国家能源集团|国家能源投资集团|中国中煤能源集团有限公司|中煤能源集团|中煤集团|国家开发投资集团有限公司|国投|华润(集团)有限公司|华润|华润集团|中国华能集团有限公司|中国华能|华能集团|国家电力投资集团有限公司|国家电投|中国华电集团有限公司|中国华电|中国华电集团公司|中国大唐集团有限公司|中国大唐集团)+煤炭+(采购|销售|中长期合同|电力|保供|哄抬|倒卖)";
List<String> strings = transForm(kwords); List<String> strings = transForm(kwords);
for (String key :strings) { for (String key :strings) {
System.out.println(key); System.out.println(key);
......
...@@ -72,6 +72,7 @@ public class SeleniumTime { ...@@ -72,6 +72,7 @@ public class SeleniumTime {
chromeOptions.addArguments("headless");//无界面参数 chromeOptions.addArguments("headless");//无界面参数
chromeOptions.addArguments("no-sandbox");//禁用沙盒 就是被这个参数搞了一天 chromeOptions.addArguments("no-sandbox");//禁用沙盒 就是被这个参数搞了一天
} }
chromeOptions.addArguments("headless");//无界面参数
driver = new ChromeDriver(chromeOptions);//生成实例 driver = new ChromeDriver(chromeOptions);//生成实例
String html=""; String html="";
try{ try{
...@@ -87,6 +88,7 @@ public class SeleniumTime { ...@@ -87,6 +88,7 @@ public class SeleniumTime {
System.out.println("browser will be close"); System.out.println("browser will be close");
WebElement webElement = driver.findElement(By.xpath("/html")); WebElement webElement = driver.findElement(By.xpath("/html"));
html = webElement.getAttribute("outerHTML"); html = webElement.getAttribute("outerHTML");
System.out.println("browser:"+html);
}catch(Exception e){ }catch(Exception e){
log.info("chromedriver 出现异常:"+e.getMessage()); log.info("chromedriver 出现异常:"+e.getMessage());
}finally { }finally {
......
...@@ -44,10 +44,11 @@ KAFKA_PRODUCT_GOOGLE_URLLIST_TOPIC=google_crawler_urlList ...@@ -44,10 +44,11 @@ KAFKA_PRODUCT_GOOGLE_URLLIST_TOPIC=google_crawler_urlList
#搜索地址 #搜索地址
#META_SEARCH_URL=https://www.google.com.hk/search?q=[keyword]&newwindow=1&tbs=cdr:1,cd_min:[startTime],cd_max:[endTime]&tbm=nws&ei=fYBfYp-CHffo2roPhoOPsA4&start=[pn] #META_SEARCH_URL=https://www.google.com.hk/search?q=[keyword]&newwindow=1&tbs=cdr:1,cd_min:[startTime],cd_max:[endTime]&tbm=nws&ei=fYBfYp-CHffo2roPhoOPsA4&start=[pn]
#META_SEARCH_URL=https://www.baidu.com/s?rtt=1&bsst=1&cl=2&tn=news&ie=utf-8&word= #META_SEARCH_URL=https://www.baidu.com/s?rtt=1&bsst=1&cl=2&tn=news&ie=utf-8&word=
META_SEARCH_URL=https://www.baidu.com/s?rtt=1&bsst=1&cl=2&tn=news&ie=utf-8&word=[keyword]&x_bfe_rqs=03E80&x_bfe_tjscore=0.100000&tngroupname=organic_news&newVideo=12&goods_entry_switch=1&rsv_dl=news_b_pn&pn=[pn] #META_SEARCH_URL=https://www.baidu.com/s?rtt=1&bsst=1&cl=2&tn=news&ie=utf-8&word=[keyword]&x_bfe_rqs=03E80&x_bfe_tjscore=0.100000&tngroupname=organic_news&newVideo=12&goods_entry_switch=1&rsv_dl=news_b_pn&pn=[pn]
META_SEARCH_URL=https://www.baidu.com/s?rtt=1&bsst=1&cl=2&tn=news&ie=utf-8&word=[keyword]
#META_SEARCH_URL=https://www.baidu.com/s?wd=[keyword]&pn=[pn]&oq=[keyword]&tn=baiduhome_pg&ie=utf-8&rsv_idx=2&gpc=stf=[startTime],[endTime]|stftype=2&tfflag=1 #META_SEARCH_URL=https://www.baidu.com/s?wd=[keyword]&pn=[pn]&oq=[keyword]&tn=baiduhome_pg&ie=utf-8&rsv_idx=2&gpc=stf=[startTime],[endTime]|stftype=2&tfflag=1
META_SEARCH_KEYWORDPATH=E:\\ideaWorkerspace\\meta_crawler\\baidu_search\\data\\projectbak.txt META_SEARCH_KEYWORDPATH=E:\\ideaWorkerspace\\meta_crawler\\baidu_search\\data\\project.txt
# Redis settings # Redis settings
redis.host=127.0.0.1 redis.host=127.0.0.1
......
...@@ -9,7 +9,7 @@ ...@@ -9,7 +9,7 @@
4.降低爬虫程序内部相应的等待时间,加快效率。 4.降低爬虫程序内部相应的等待时间,加快效率。
httpclient 添加代理请求
...@@ -19,6 +19,12 @@ ...@@ -19,6 +19,12 @@
<dependencies> <dependencies>
<dependency> <dependency>
<groupId>ru.yandex.qatools.ashot</groupId>
<artifactId>ashot</artifactId>
<version>1.5.4</version>
</dependency>
<dependency>
<groupId>com.huaweicloud</groupId> <groupId>com.huaweicloud</groupId>
<artifactId>esdk-obs-java-bundle</artifactId> <artifactId>esdk-obs-java-bundle</artifactId>
<version>[3.21.8,)</version> <version>[3.21.8,)</version>
......
...@@ -26,6 +26,7 @@ public class CrawlerCommVerifyController extends BaseController { ...@@ -26,6 +26,7 @@ public class CrawlerCommVerifyController extends BaseController {
@ResponseBody @ResponseBody
public String VerifyPageList(@RequestBody SiteMsgTemple siteMsgTemple, HttpServletResponse response){ public String VerifyPageList(@RequestBody SiteMsgTemple siteMsgTemple, HttpServletResponse response){
SiteInfoVerify siteInfoVerify=new SiteInfoVerify(); SiteInfoVerify siteInfoVerify=new SiteInfoVerify();
siteMsgTemple.setVerifyType("1");
VerifyResult verifyResult = siteInfoVerify.crawlerPageList(siteMsgTemple); VerifyResult verifyResult = siteInfoVerify.crawlerPageList(siteMsgTemple);
return MsgUtil.outSiteJSON(verifyResult); return MsgUtil.outSiteJSON(verifyResult);
} }
...@@ -34,6 +35,7 @@ public class CrawlerCommVerifyController extends BaseController { ...@@ -34,6 +35,7 @@ public class CrawlerCommVerifyController extends BaseController {
@ResponseBody @ResponseBody
public String VerifyDetailMsg(@RequestBody SiteMsgTemple siteMsgTemple, HttpServletResponse response){ public String VerifyDetailMsg(@RequestBody SiteMsgTemple siteMsgTemple, HttpServletResponse response){
SiteInfoVerify siteInfoVerify=new SiteInfoVerify(); SiteInfoVerify siteInfoVerify=new SiteInfoVerify();
// siteMsgTemple.setVerifyType("1");
VerifyResult verifyResult = siteInfoVerify.crawlerDetialMsg(siteMsgTemple); VerifyResult verifyResult = siteInfoVerify.crawlerDetialMsg(siteMsgTemple);
return MsgUtil.outSiteJSON(verifyResult); return MsgUtil.outSiteJSON(verifyResult);
} }
...@@ -42,6 +44,7 @@ public class CrawlerCommVerifyController extends BaseController { ...@@ -42,6 +44,7 @@ public class CrawlerCommVerifyController extends BaseController {
@ResponseBody @ResponseBody
public String VerifyScreenshot(@RequestBody SiteMsgTemple siteMsgTemple, HttpServletResponse response){ public String VerifyScreenshot(@RequestBody SiteMsgTemple siteMsgTemple, HttpServletResponse response){
SiteInfoVerify siteInfoVerify=new SiteInfoVerify(); SiteInfoVerify siteInfoVerify=new SiteInfoVerify();
// siteMsgTemple.setVerifyType("1");
VerifyResult verifyResult = siteInfoVerify.crawlerScreenshot(siteMsgTemple); VerifyResult verifyResult = siteInfoVerify.crawlerScreenshot(siteMsgTemple);
return MsgUtil.outSiteJSON(verifyResult); return MsgUtil.outSiteJSON(verifyResult);
} }
......
...@@ -26,7 +26,7 @@ public class DBConnection { ...@@ -26,7 +26,7 @@ public class DBConnection {
} }
// 获取数据库的连接 // 获取数据库的连接
try { try {
Connection conn = java.sql.DriverManager.getConnection("jdbc:mysql://localhost:3306/clb_project?useUnicode=true&characterEncoding=utf8", "root", "root"); Connection conn = java.sql.DriverManager.getConnection("jdbc:mysql://localhost:3306/clb_project?useUnicode=true&characterEncoding=utf8&serverTimezone=Asia/Shanghai", "root", "root");
return conn; return conn;
} catch (SQLException e1) { } catch (SQLException e1) {
e1.printStackTrace(); e1.printStackTrace();
......
...@@ -6,6 +6,7 @@ import com.jayway.jsonpath.JsonPath; ...@@ -6,6 +6,7 @@ import com.jayway.jsonpath.JsonPath;
import com.zzsn.configuration.SpringContextUtil; import com.zzsn.configuration.SpringContextUtil;
import com.zzsn.crawler.PaserSiteDownload; import com.zzsn.crawler.PaserSiteDownload;
import com.zzsn.crawler.uriparser.HtmlPageParser; import com.zzsn.crawler.uriparser.HtmlPageParser;
import com.zzsn.crawler.uriparser.SeleniumTime;
import com.zzsn.download.PageBuilderParser; import com.zzsn.download.PageBuilderParser;
import com.zzsn.download.PageDownloader; import com.zzsn.download.PageDownloader;
import com.zzsn.entity.*; import com.zzsn.entity.*;
...@@ -185,11 +186,15 @@ public class WebContentPaserByJsonXpath { ...@@ -185,11 +186,15 @@ public class WebContentPaserByJsonXpath {
count++; count++;
try { try {
CatchWebByMetaSearch cwbm = catchWebList.get(i); CatchWebByMetaSearch cwbm = catchWebList.get(i);
// boolean sismember = JedisUtil.sismember(cwbm.getSourceaddress(), "1"); String rediskey = siteMsgTemple.getInfoSourceCode();
boolean sismember = JedisUtil.exists(cwbm.getSourceaddress()); try {
if(sismember){ boolean sismember = JedisUtil.sismember(rediskey, cwbm.getSourceaddress());
log.info("栏目信息重复:"+siteMsgTemple.getSiteName()+" :" +cwbm.getSourceaddress()); if (sismember) {
continue; log.info("栏目信息重复:" + siteMsgTemple.getSiteName() + " :" + cwbm.getSourceaddress());
continue;
}
} catch (Exception e) {
log.info("缓存出问题");
} }
// 请求下载内容 // 请求下载内容
// String content = getContent(cwbm); // String content = getContent(cwbm);
...@@ -252,13 +257,13 @@ public class WebContentPaserByJsonXpath { ...@@ -252,13 +257,13 @@ public class WebContentPaserByJsonXpath {
// } // }
kafkaTemplate.send(Constants.KAFKA_PRODUCT_TOPIC, docjson); kafkaTemplate.send(Constants.KAFKA_PRODUCT_TOPIC, docjson);
docInfoList.add(docInfo); docInfoList.add(docInfo);
JedisUtil.sadd(rediskey, cwbm.getSourceaddress());
log.info("发送到kafka成功。"); log.info("发送到kafka成功。");
} catch (JsonProcessingException e) { } catch (JsonProcessingException e) {
// e.printStackTrace(); // e.printStackTrace();
log.info("发送到kafka失败。"); log.info("发送到kafka失败。");
continue; continue;
} }
JedisUtil.setString(cwbm.getSourceaddress(),"1",-1);
} catch (Exception e){ } catch (Exception e){
continue; continue;
...@@ -291,11 +296,13 @@ public class WebContentPaserByJsonXpath { ...@@ -291,11 +296,13 @@ public class WebContentPaserByJsonXpath {
content = pageDownload.downloadWithStr(cwbm.getSourceaddress(),cwbm.getCharset(),true,false); content = pageDownload.downloadWithStr(cwbm.getSourceaddress(),cwbm.getCharset(),true,false);
if(StringUtils.isEmpty(content)){ if(StringUtils.isEmpty(content)){
content = paserSiteDownload.getContent(cwbm); content = paserSiteDownload.getContent(cwbm);
// if(StringUtils.isEmpty(content)) { if(StringUtils.isEmpty(content)) {
// SeleniumTime seleniumTime = new SeleniumTime(); if(siteMsgTemple.getVerifyType()!=null&&siteMsgTemple.getVerifyType().contains("1")){
// content = seleniumTime.getScopehtml(cwbm.getSourceaddress()); content = SeleniumTime.getVerifyScopehtml(cwbm.getSourceaddress());
// seleniumTime.close(); }else {
// } content = SeleniumTime.getScopehtml(cwbm.getSourceaddress());
}
}
} }
} }
} }
......
...@@ -7,6 +7,7 @@ import com.zzsn.crawler.PaserSiteDownload; ...@@ -7,6 +7,7 @@ import com.zzsn.crawler.PaserSiteDownload;
import com.zzsn.crawler.uriparser.HtmlPageParser; import com.zzsn.crawler.uriparser.HtmlPageParser;
import com.zzsn.crawler.uriparser.SeleniumTime; import com.zzsn.crawler.uriparser.SeleniumTime;
import com.zzsn.crawler.uriparser.WebPageScreenShot; import com.zzsn.crawler.uriparser.WebPageScreenShot;
import com.zzsn.crawler.uriparser.obs.ObsUpload;
import com.zzsn.download.PageBuilderParser; import com.zzsn.download.PageBuilderParser;
import com.zzsn.download.PageDownloader; import com.zzsn.download.PageDownloader;
import com.zzsn.entity.*; import com.zzsn.entity.*;
...@@ -41,6 +42,7 @@ import javax.xml.transform.dom.DOMSource; ...@@ -41,6 +42,7 @@ import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult; import javax.xml.transform.stream.StreamResult;
import javax.xml.xpath.XPathExpressionException; import javax.xml.xpath.XPathExpressionException;
import java.io.IOException; import java.io.IOException;
import java.io.InputStream;
import java.io.StringWriter; import java.io.StringWriter;
import java.net.URI; import java.net.URI;
import java.net.URL; import java.net.URL;
...@@ -49,6 +51,7 @@ import java.security.KeyStoreException; ...@@ -49,6 +51,7 @@ import java.security.KeyStoreException;
import java.security.NoSuchAlgorithmException; import java.security.NoSuchAlgorithmException;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.Collections; import java.util.Collections;
import java.util.HashMap;
import java.util.List; import java.util.List;
import java.util.regex.Matcher; import java.util.regex.Matcher;
import java.util.regex.Pattern; import java.util.regex.Pattern;
...@@ -99,7 +102,11 @@ public class WebContentPaserByXpath { ...@@ -99,7 +102,11 @@ public class WebContentPaserByXpath {
} }
} }
if ( StringUtils.isEmpty(body) && siteMsgTemple.getYnDynamicCrawl() == 1) { if ( StringUtils.isEmpty(body) && siteMsgTemple.getYnDynamicCrawl() == 1) {
body = SeleniumTime.getScopehtml(uri_code); if(siteMsgTemple.getVerifyType()!=null&&siteMsgTemple.getVerifyType().contains("1")){
body = SeleniumTime.getVerifyScopehtml(uri_code);
}else {
body = SeleniumTime.getScopehtml(uri_code);
}
} }
} }
// if(StringUtils.isEmpty(body)){ // if(StringUtils.isEmpty(body)){
...@@ -107,12 +114,15 @@ public class WebContentPaserByXpath { ...@@ -107,12 +114,15 @@ public class WebContentPaserByXpath {
// } // }
//抽取资讯url //抽取资讯url
List<CatchWebByMetaSearch> catchWebByMetaSearches = parserCrawlerSiteListByXpath(siteMsgTemple, body); List<CatchWebByMetaSearch> catchWebByMetaSearches = parserCrawlerSiteListByXpath(siteMsgTemple, body);
catchWebByMetaSearchList.addAll(catchWebByMetaSearches); if (catchWebByMetaSearches.size()<1 && siteMsgTemple.getYnDynamicCrawl() == 1) {
if(StringUtils.isNotEmpty(siteMsgTemple.getIsScreenshot()) && siteMsgTemple.getIsScreenshot().contains("1")){ body = SeleniumTime.getScopehtml(uri_code);
String imagUrl=""; catchWebByMetaSearches = parserCrawlerSiteListByXpath(siteMsgTemple, body);
WebPageScreenShot webPageScreenShot=new WebPageScreenShot(); }
webPageScreenShot.loadPage(uri_code,Constants.IMGPATH); if(catchWebByMetaSearches.size()<1){
// sentBadSiteMsg(siteMsgTemple,"网络访问请求异常","1");
continue;
} }
catchWebByMetaSearchList.addAll(catchWebByMetaSearches);
} catch (Exception e) { } catch (Exception e) {
log.info("列表下载异常 对应的链接:"+uri_code); log.info("列表下载异常 对应的链接:"+uri_code);
if(seleniumTime!=null) { if(seleniumTime!=null) {
...@@ -295,11 +305,15 @@ public class WebContentPaserByXpath { ...@@ -295,11 +305,15 @@ public class WebContentPaserByXpath {
if (cwbm.getSourceaddress() == null || cwbm.getSourceaddress().contains(".pdf") || cwbm.getSourceaddress().trim().length()==0|| cwbm.getSourceaddress().contains(".PDF")||cwbm.getSourceaddress().contains("download")) { if (cwbm.getSourceaddress() == null || cwbm.getSourceaddress().contains(".pdf") || cwbm.getSourceaddress().trim().length()==0|| cwbm.getSourceaddress().contains(".PDF")||cwbm.getSourceaddress().contains("download")) {
continue; continue;
} }
//boolean sismember = JedisUtil.sismember(cwbm.getSourceaddress(), "1"); String rediskey = siteMsgTemple.getInfoSourceCode();
boolean sismember = JedisUtil.exists(cwbm.getSourceaddress()); try {
if(sismember){ boolean sismember = JedisUtil.sismember(rediskey, cwbm.getSourceaddress());
log.info("栏目信息重复:"+siteMsgTemple.getSiteName()+" :" +cwbm.getSourceaddress()); if (sismember) {
continue; log.info("栏目信息重复:" + siteMsgTemple.getSiteName() + " :" + cwbm.getSourceaddress());
continue;
}
} catch (Exception e) {
log.info("缓存出问题");
} }
// 请求下载内容 // 请求下载内容
String content=""; String content="";
...@@ -343,11 +357,6 @@ public class WebContentPaserByXpath { ...@@ -343,11 +357,6 @@ public class WebContentPaserByXpath {
} }
} }
} }
if(StringUtils.isNotEmpty(siteMsgTemple.getIsScreenshot()) && siteMsgTemple.getIsScreenshot().contains("1")){
String imagUrl="";
WebPageScreenShot webPageScreenShot=new WebPageScreenShot();
webPageScreenShot.loadPage(cwbm.getSourceaddress(),Constants.IMGPATH);
}
DocInfo docInfo = new DocInfo(); DocInfo docInfo = new DocInfo();
docInfo.setContentType("HTML"); docInfo.setContentType("HTML");
docInfo.setOrgId(cwbm.getOrgId()); docInfo.setOrgId(cwbm.getOrgId());
...@@ -387,6 +396,17 @@ public class WebContentPaserByXpath { ...@@ -387,6 +396,17 @@ public class WebContentPaserByXpath {
}else{ }else{
processitem.setSource("1"); processitem.setSource("1");
} }
//使用浏览器截取图片
if (StringUtils.isNotEmpty(siteMsgTemple.getYnSnapshot()) && siteMsgTemple.getYnSnapshot().contains("1")) {
String imagUrl = "";
// WebPageScreenShot webPageScreenShot = new WebPageScreenShot();
// webPageScreenShot.loadPage(cwbm.getSourceaddress(), Constants.IMGPATH);
InputStream inputStream =SeleniumTime.getScreenshot(cwbm.getSourceaddress());
HashMap map = ObsUpload.uploadInputStream(inputStream, "png");
imagUrl=map.get("objectUrl").toString();
processitem.setScreenShotImg(imagUrl);
}
String docjson = mapper.writeValueAsString(processitem); String docjson = mapper.writeValueAsString(processitem);
// kafkaTemplate.send(Constants.KAFKA_PRODUCT_TOPIC, "key", docjson); // kafkaTemplate.send(Constants.KAFKA_PRODUCT_TOPIC, "key", docjson);
int partition=0; int partition=0;
...@@ -403,8 +423,8 @@ public class WebContentPaserByXpath { ...@@ -403,8 +423,8 @@ public class WebContentPaserByXpath {
log.info("发送到kafka失败。"); log.info("发送到kafka失败。");
continue; continue;
} }
JedisUtil.setString(cwbm.getSourceaddress(),"1",-1); // JedisUtil.setString(cwbm.getSourceaddress(),"1",-1);
JedisUtil.sadd(rediskey, cwbm.getSourceaddress());
} catch (Exception e){ } catch (Exception e){
if(seleniumTime!=null) { if(seleniumTime!=null) {
seleniumTime.close(); seleniumTime.close();
...@@ -434,17 +454,21 @@ public class WebContentPaserByXpath { ...@@ -434,17 +454,21 @@ public class WebContentPaserByXpath {
String content=""; String content="";
try { try {
if(siteMsgTemple.getYnDynamicCrawl()==1) { if(siteMsgTemple.getYnDynamicCrawl()==1) {
seleniumTime=new SeleniumTime(); if(siteMsgTemple.getVerifyType()!=null&&siteMsgTemple.getVerifyType().contains("1")){
content = seleniumTime.getScopehtml(cwbm.getSourceaddress()); content = SeleniumTime.getVerifyScopehtml(cwbm.getSourceaddress());
seleniumTime.close(); }else {
content = SeleniumTime.getScopehtml(cwbm.getSourceaddress());
}
}else{ }else{
content = pageDownload.downloadWithStr(cwbm.getSourceaddress(), cwbm.getCharset(), true, false); content = pageDownload.downloadWithStr(cwbm.getSourceaddress(), cwbm.getCharset(), true, false);
if(StringUtils.isEmpty(content)){ if(StringUtils.isEmpty(content)){
content = paserSiteDownload.getContent(cwbm); content = paserSiteDownload.getContent(cwbm);
if(StringUtils.isEmpty(content)) { if(StringUtils.isEmpty(content)) {
seleniumTime = new SeleniumTime(); if(siteMsgTemple.getVerifyType()!=null&&siteMsgTemple.getVerifyType().contains("1")){
content = seleniumTime.getScopehtml(cwbm.getSourceaddress()); content = SeleniumTime.getVerifyScopehtml(cwbm.getSourceaddress());
seleniumTime.close(); }else {
content = SeleniumTime.getScopehtml(cwbm.getSourceaddress());
}
} }
} }
} }
...@@ -458,9 +482,11 @@ public class WebContentPaserByXpath { ...@@ -458,9 +482,11 @@ public class WebContentPaserByXpath {
} else { } else {
content = pageDownload.downloadWithStr(cwbm.getSourceaddress(), cwbm.getCharset(), true, false); content = pageDownload.downloadWithStr(cwbm.getSourceaddress(), cwbm.getCharset(), true, false);
if (StringUtils.isEmpty(content)) { if (StringUtils.isEmpty(content)) {
seleniumTime=new SeleniumTime(); if(siteMsgTemple.getVerifyType()!=null&&siteMsgTemple.getVerifyType().contains("1")){
content = seleniumTime.getScopehtml(cwbm.getSourceaddress()); content = SeleniumTime.getVerifyScopehtml(cwbm.getSourceaddress());
seleniumTime.close(); }else {
content = SeleniumTime.getScopehtml(cwbm.getSourceaddress());
}
} }
} }
} }
......
...@@ -13,6 +13,7 @@ import com.zzsn.generation.Constants; ...@@ -13,6 +13,7 @@ import com.zzsn.generation.Constants;
import com.zzsn.util.DriverUtil; import com.zzsn.util.DriverUtil;
import lombok.extern.slf4j.Slf4j; import lombok.extern.slf4j.Slf4j;
import org.openqa.selenium.*; import org.openqa.selenium.*;
import org.openqa.selenium.Dimension;
import org.openqa.selenium.chrome.ChromeDriver; import org.openqa.selenium.chrome.ChromeDriver;
import org.openqa.selenium.chrome.ChromeDriverService; import org.openqa.selenium.chrome.ChromeDriverService;
import org.openqa.selenium.chrome.ChromeOptions; import org.openqa.selenium.chrome.ChromeOptions;
...@@ -24,6 +25,42 @@ public class SeleniumTime { ...@@ -24,6 +25,42 @@ public class SeleniumTime {
public SeleniumTime(){ public SeleniumTime(){
} }
// @Async("asyncTaskExecutorSelenium")
/**
 * Loads {@code url} in a Chrome instance driven by a dedicated chromedriver
 * service and returns the outer HTML of the document's root element.
 *
 * <p>The driver is bound to the service started from
 * {@code Constants.CHROMEDRIVE}; both the browser and the service are shut
 * down before returning, so no Chrome/chromedriver process outlives the call.
 *
 * @param url address of the page to load
 * @return the {@code outerHTML} of the {@code /html} element, or an empty
 *         string when the service, the browser or the page load fails
 */
public static String getVerifyScopehtml(String url) {
    String html = "";
    ChromeOptions chromeOptions = new ChromeOptions();
    ChromeDriver driver = null;
    ChromeDriverService service = new ChromeDriverService.Builder().
            usingDriverExecutable(new File(Constants.CHROMEDRIVE)).usingAnyFreePort().build();
    try {
        service.start();
        if (!System.getProperty("os.name").toUpperCase().contains("WINDOWS")) {
            chromeOptions.addArguments("--disable-gpu", "--window-size=1290,1080");
            chromeOptions.addArguments("headless"); // run without a visible window
            chromeOptions.addArguments("no-sandbox"); // disable the sandbox (this flag cost a day of debugging)
        }
        // Bind the driver to the service started above instead of letting
        // ChromeDriver spawn a second, unmanaged default service.
        driver = new ChromeDriver(service, chromeOptions);
        try {
            Duration duration = Duration.of(100, ChronoUnit.SECONDS);
            driver.manage().timeouts().pageLoadTimeout(duration);
            driver.get(url);
            Thread.sleep(100);
            try {
                WebElement webElement = driver.findElement(By.xpath("/html"));
                html = webElement.getAttribute("outerHTML");
            } catch (Exception e) {
                log.info("获取页面内容异常:" + e.getMessage());
            }
        } catch (InterruptedException e) {
            // Preserve the interrupt for the calling thread; still report as a failed page visit.
            Thread.currentThread().interrupt();
            log.info("驱动打开URL异常:" + e.getMessage());
        } catch (Exception e) {
            // The driver session failed; the driver is quit below and a fresh one is opened on the next call.
            log.info("驱动打开URL异常:" + e.getMessage());
        }
    } catch (Exception e) {
        log.info("驱动访问页面出现出现异常:" + e.getMessage());
    } finally {
        // Always release the browser and the chromedriver process.
        if (driver != null) {
            try {
                driver.quit();
            } catch (Exception e) {
                log.info("关闭驱动异常:" + e.getMessage());
            }
        }
        if (service.isRunning()) {
            service.stop();
        }
    }
    return html;
}
/** /**
* 调用驱动获取html信息 * 调用驱动获取html信息
* @param url 网页地址 * @param url 网页地址
...@@ -38,7 +75,7 @@ public class SeleniumTime { ...@@ -38,7 +75,7 @@ public class SeleniumTime {
Duration duration=Duration.of(100, ChronoUnit.SECONDS); Duration duration=Duration.of(100, ChronoUnit.SECONDS);
driver.manage().timeouts().pageLoadTimeout(duration); driver.manage().timeouts().pageLoadTimeout(duration);
driver.get(url); driver.get(url);
Thread.sleep(10002); Thread.sleep(1000);
try { try {
WebElement webElement = driver.findElement(By.xpath("/html")); WebElement webElement = driver.findElement(By.xpath("/html"));
html = webElement.getAttribute("outerHTML"); html = webElement.getAttribute("outerHTML");
...@@ -71,6 +108,9 @@ public class SeleniumTime { ...@@ -71,6 +108,9 @@ public class SeleniumTime {
chromeOptions.addArguments("headless");//无界面参数 chromeOptions.addArguments("headless");//无界面参数
chromeOptions.addArguments("no-sandbox");//禁用沙盒 就是被这个参数搞了一天 chromeOptions.addArguments("no-sandbox");//禁用沙盒 就是被这个参数搞了一天
} }
chromeOptions.addArguments("--disable-gpu", "--window-size=1290,1080");
chromeOptions.addArguments("headless");//无界面参数
chromeOptions.addArguments("no-sandbox");//禁用沙盒 就是被这个参数搞了一天
driver = new ChromeDriver(service, chromeOptions);//生成实例 driver = new ChromeDriver(service, chromeOptions);//生成实例
InputStream inStream = null; InputStream inStream = null;
try{ try{
...@@ -78,6 +118,9 @@ public class SeleniumTime { ...@@ -78,6 +118,9 @@ public class SeleniumTime {
driver.get(url); driver.get(url);
Thread.sleep(3000l); Thread.sleep(3000l);
try { try {
long width = (long) driver.executeScript("return document.body.scrollWidth");
long height = (long) driver.executeScript("return document.body.scrollHeight");
driver.manage().window().setSize(new Dimension((int) width, (int) height));
byte[] screenshotBytes = driver.getScreenshotAs(OutputType.BYTES); byte[] screenshotBytes = driver.getScreenshotAs(OutputType.BYTES);
inStream = new ByteArrayInputStream(screenshotBytes); inStream = new ByteArrayInputStream(screenshotBytes);
// File src = ((TakesScreenshot) driver).getScreenshotAs(OutputType.FILE); // File src = ((TakesScreenshot) driver).getScreenshotAs(OutputType.FILE);
......
...@@ -41,8 +41,8 @@ public class WebPageScreenShot { ...@@ -41,8 +41,8 @@ public class WebPageScreenShot {
// driver.manage().window().maximize(); // driver.manage().window().maximize();
String js1 = "return document.body.clientHeight.toString()"; String js1 = "return document.body.clientHeight.toString()";
// String js1_result = ((JavascriptExecutor) driver).executeScript(js1) + ""; String js1_result = ((JavascriptExecutor) driver).executeScript(js1) + "";
// int height = Integer.parseInt(js1_result); int height = Integer.parseInt(js1_result);
List<String> files = new ArrayList<String>(); List<String> files = new ArrayList<String>();
int last_t = 0; int last_t = 0;
// for (int i = 0; i < 20; ) { // for (int i = 0; i < 20; ) {
...@@ -80,7 +80,7 @@ public class WebPageScreenShot { ...@@ -80,7 +80,7 @@ public class WebPageScreenShot {
CustomScreenshot customScreenshot=new CustomScreenshot(); CustomScreenshot customScreenshot=new CustomScreenshot();
files.add(customScreenshot.fullScreenshotLong(driver).getAbsolutePath()); files.add(customScreenshot.fullScreenshotLong(driver).getAbsolutePath());
driver.quit();//退出浏览器 driver.quit();//退出浏览器
// boolean flag = merge(files.toArray(new String[]{}), type, resultPath); boolean flag = merge(files.toArray(new String[]{}), type, resultPath);
// if(flag){ // if(flag){
// InputStream inputStream =new BufferedInputStream(new FileInputStream(resultPath)); // InputStream inputStream =new BufferedInputStream(new FileInputStream(resultPath));
// HashMap map = ObsUpload.uploadShotInputStream(inputStream, "png"); // HashMap map = ObsUpload.uploadShotInputStream(inputStream, "png");
...@@ -124,7 +124,7 @@ public class WebPageScreenShot { ...@@ -124,7 +124,7 @@ public class WebPageScreenShot {
}); });
ByteArrayInputStream bytes = new ByteArrayInputStream(imageBytes); ByteArrayInputStream bytes = new ByteArrayInputStream(imageBytes);
BufferedImage image = ImageIO.read(bytes); BufferedImage image = ImageIO.read(bytes);
File file = File.createTempFile((new Random()).nextInt()+"",type); File file = File.createTempFile((new Random()).nextInt()+"","."+type);
ImageIO.write(image, "png", file); ImageIO.write(image, "png", file);
return file; return file;
} }
...@@ -208,7 +208,7 @@ public class WebPageScreenShot { ...@@ -208,7 +208,7 @@ public class WebPageScreenShot {
public static void main(String[] args) throws IOException, InterruptedException { public static void main(String[] args) throws IOException, InterruptedException {
WebPageScreenShot screenShot=new WebPageScreenShot(); WebPageScreenShot screenShot=new WebPageScreenShot();
screenShot.loadPage("https://cicftz.sufe.edu.cn/59/03/c4480a153859/page.htm",Constants.IMGPATH); screenShot.loadPage("https://www.lamayor.org/mayor-garcetti-and-congressman-gomez-celebrate-1-millon-funding-boyle-heights-community-cooling","E:\\chrome\\img\\shot.png");
} }
......
...@@ -15,59 +15,60 @@ public class ArticleCrawler { ...@@ -15,59 +15,60 @@ public class ArticleCrawler {
public void consumer(){ public void consumer(){
String record="{\n" + String record="{\n" +
" \"@class\": \"com.zzsn.clb.common.model.task.dto.titr.InfoSourceDTO\",\n" + " \"@class\": \"com.zzsn.clb.common.model.task.dto.titr.InfoSourceDTO\",\n" +
" \"id\": \"1511551439137689601\",\n" + " \"id\": \"1560150270181019650\",\n" +
" \"infoSourceCode\": \"IN-20220406-0392\",\n" + " \"infoSourceCode\": \"IN-20220818-0011\",\n" +
" \"webSiteName\": \"中国国资国企产业创新战略联盟\",\n" + " \"webSiteName\": \"一带一路-项目周报\",\n" +
" \"siteName\": \"中国国资国企产业创新战略联盟-市场观察\",\n" + " \"siteName\": \"一带一路-项目周报\",\n" +
" \"siteUri\": \"http://www.guozi.org.cn/list/?7_1.html\",\n" + " \"siteUri\": \"https://www.yidaiyilu.gov.cn/info/iList.jsp?cat_id=11432&cur_page=3\",\n" +
" \"infoSourceTypeId\": \"1\",\n" + " \"infoSourceTypeId\": \"1\",\n" +
" \"language\": \"zh\",\n" + " \"siteLevel\": null,\n" +
" \"language\": null,\n" +
" \"checkedList\": null,\n" + " \"checkedList\": null,\n" +
" \"hisUriExp\": \"http://www.guozi.org.cn/list/?7_<num minBit=2 start=2 end=174/>.html\",\n" + " \"hisUriExp\": null,\n" +
" \"hisDateStartTime\": null,\n" + " \"hisDateStartTime\": null,\n" +
" \"hisDateEndTime\": null,\n" + " \"hisDateEndTime\": null,\n" +
" \"ynHisDataAll\": null,\n" + " \"ynHisDataAll\": \"0\",\n" +
" \"status\": \"1\",\n" + " \"status\": null,\n" +
" \"listUrl\": null,\n" + " \"listUrl\": null,\n" +
" \"listExpressionType\": \"0\",\n" + " \"listExpressionType\": \"3\",\n" +
" \"informationUrl\": \"^http://www\\\\.guozi\\\\.org\\\\.cn/content/\\\\?[\\\\d]{1,}\\\\.html\",\n" + " \"informationUrl\": null,\n" +
" \"informationTitle\": null,\n" + " \"informationTitle\": \"a\",\n" +
" \"informationPublishDate\": null,\n" + " \"informationPublishDate\": \"span\",\n" +
" \"informationSource\": null,\n" + " \"informationSource\": null,\n" +
" \"infoBlockPosition\": null,\n" + " \"infoBlockPosition\": \"ul[class=\\\"commonList_dot\\\"]>li\",\n" +
" \"linkLocation\": null,\n" + " \"linkLocation\": \"a\",\n" +
" \"extractInfo\": \"[{\\\"id\\\": 0, \\\"name\\\": \\\"\\\", \\\"explain\\\": \\\"\\\", \\\"expression\\\": \\\"\\\"}]\",\n" + " \"extractInfo\": \"[{\\\"id\\\":0,\\\"name\\\":\\\"\\\",\\\"expression\\\":\\\"\\\",\\\"explain\\\":\\\"\\\"}]\",\n" +
" \"crawlDepth\": 2,\n" + " \"crawlDepth\": null,\n" +
" \"pageUrl\": null,\n" + " \"pageUrl\": null,\n" +
" \"matchPage\": null,\n" + " \"matchPage\": null,\n" +
" \"pageStart\": 0,\n" + " \"pageStart\": 0,\n" +
" \"pageEnd\": 0,\n" + " \"pageEnd\": 0,\n" +
" \"ynPageAll\": null,\n" + " \"ynPageAll\": \"0\",\n" +
" \"detailExpressionType\": \"0\",\n" + " \"detailExpressionType\": \"3\",\n" +
" \"detailUrl\": null,\n" + " \"detailUrl\": null,\n" +
" \"detailExpressionTitle\": \"<title><exp>*.div[class=\\\"title1\\\"].h1[1]</exp></title>\",\n" + " \"detailExpressionTitle\": \"<title><exp>h1[class=\\\"main_content_title\\\"]</exp></title>\",\n" +
" \"detailExpressionPublishDate\": \"<publish_date><exp>*.div[class=\\\"infotxt\\\"]</exp></publish_date>\",\n" + " \"detailExpressionPublishDate\": \"<publish_date><exp>div[class=\\\"szty\\\"]>span:contains(时间)</exp></publish_date>\",\n" +
" \"detailExpressionSource\": null,\n" + " \"detailExpressionSource\": \"<origin><exp>div[class=\\\"szty\\\"]>span:contains(来源)</exp></origin>\",\n" +
" \"detailExpressionAuthor\": null,\n" + " \"detailExpressionAuthor\": null,\n" +
" \"detailExpressionSummary\": null,\n" + " \"detailExpressionSummary\": null,\n" +
" \"detailExpressionContent\": \"<content><exp>*.div[class=\\\"bodybox\\\"]</exp></content>\",\n" + " \"detailExpressionContent\": \"<content><exp>div[class=\\\"content\\\"]</exp></content>\",\n" +
" \"detailInfo\": \"[{\\\"id\\\": 0, \\\"name\\\": \\\"\\\", \\\"explain\\\": \\\"\\\", \\\"expression\\\": \\\"\\\"}]\",\n" + " \"detailInfo\": \"[{\\\"id\\\":0,\\\"name\\\":\\\"\\\",\\\"expression\\\":\\\"\\\",\\\"explain\\\":\\\"\\\"}]\",\n" +
" \"ynDownload\": null,\n" + " \"ynDownload\": \"0\",\n" +
" \"formUrl\": null,\n" + " \"formUrl\": null,\n" +
" \"formTitle\": null,\n" + " \"formTitle\": null,\n" +
" \"formType\": null,\n" + " \"formType\": null,\n" +
" \"dataFormExpression\": null,\n" + " \"dataFormExpression\": null,\n" +
" \"dataFormInfo\": \"[{\\\"id\\\": 0, \\\"name\\\": \\\"\\\", \\\"explain\\\": \\\"\\\", \\\"mapping\\\": \\\"\\\", \\\"expression\\\": \\\"\\\", \\\"primaryKey\\\": \\\"\\\"}]\",\n" + " \"dataFormInfo\": \"[{\\\"id\\\":0,\\\"name\\\":\\\"\\\",\\\"expression\\\":\\\"\\\",\\\"mapping\\\":\\\"\\\",\\\"primaryKey\\\":\\\"\\\",\\\"explain\\\":\\\"\\\"}]\",\n" +
" \"dataPageUrl\": null,\n" + " \"dataPageUrl\": null,\n" +
" \"dataPageRule\": null,\n" + " \"dataPageRule\": null,\n" +
" \"dataPageStart\": 0,\n" + " \"dataPageStart\": 0,\n" +
" \"dataPageEnd\": 0,\n" + " \"dataPageEnd\": 0,\n" +
" \"ynDataPageAll\": null,\n" + " \"ynDataPageAll\": \"0\",\n" +
" \"dataType\": 0,\n" + " \"dataType\": 0,\n" +
" \"dataFormat\": 0,\n" + " \"dataFormat\": 0,\n" +
" \"dataStorageMode\": 0,\n" + " \"dataStorageMode\": 0,\n" +
" \"dataStorageInfo\": \"{}\",\n" + " \"dataStorageInfo\": \"{\\\"accessMode\\\":\\\"FTP\\\"}\",\n" +
" \"ynDynamicCrawl\": 0,\n" + " \"ynDynamicCrawl\": 1,\n" +
" \"ynLogin\": 0,\n" + " \"ynLogin\": 0,\n" +
" \"domainName\": null,\n" + " \"domainName\": null,\n" +
" \"link\": null,\n" + " \"link\": null,\n" +
...@@ -81,8 +82,9 @@ public class ArticleCrawler { ...@@ -81,8 +82,9 @@ public class ArticleCrawler {
" \"crawlType\": 1,\n" + " \"crawlType\": 1,\n" +
" \"crawlName\": null,\n" + " \"crawlName\": null,\n" +
" \"crawlAddress\": null,\n" + " \"crawlAddress\": null,\n" +
" \"parameter\": null,\n" + " \"parameter\": \"{\\\"crawlingParam\\\":[{\\\"id\\\":0,\\\"name\\\":\\\"\\\",\\\"explain\\\":\\\"\\\",\\\"content\\\":\\\"\\\"}]}\",\n" +
" \"cron\": \"41 48 0/8 * * ?\"\n" + " \"cron\": \"05 23 14 1/7 * ?\",\n" +
" \"ynSnapshot\": \"0\"\n" +
"}"; "}";
SiteMsgTemple siteMsgTemple = new Gson().fromJson(record, SiteMsgTemple.class); SiteMsgTemple siteMsgTemple = new Gson().fromJson(record, SiteMsgTemple.class);
ArticleCrawlerThread articleCrawlerThread=new ArticleCrawlerThread(); ArticleCrawlerThread articleCrawlerThread=new ArticleCrawlerThread();
......
...@@ -6,6 +6,7 @@ import com.zzsn.crawler.db.SnowIdUtils; ...@@ -6,6 +6,7 @@ import com.zzsn.crawler.db.SnowIdUtils;
import com.zzsn.crawler.uriparser.HtmlPageParser; import com.zzsn.crawler.uriparser.HtmlPageParser;
import com.zzsn.crawler.uriparser.SeleniumTime; import com.zzsn.crawler.uriparser.SeleniumTime;
import com.zzsn.crawler.uriparser.WebPageScreenShot; import com.zzsn.crawler.uriparser.WebPageScreenShot;
import com.zzsn.crawler.uriparser.obs.ObsUpload;
import com.zzsn.crawlerOther.StandardWebExtractorHandler; import com.zzsn.crawlerOther.StandardWebExtractorHandler;
import com.zzsn.download.PageBuilderParser; import com.zzsn.download.PageBuilderParser;
import com.zzsn.download.PageDownloader; import com.zzsn.download.PageDownloader;
...@@ -25,11 +26,13 @@ import org.jsoup.nodes.Document; ...@@ -25,11 +26,13 @@ import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element; import org.jsoup.nodes.Element;
import org.jsoup.select.Elements; import org.jsoup.select.Elements;
import java.io.InputStream;
import java.net.URI; import java.net.URI;
import java.net.URL; import java.net.URL;
import java.sql.SQLException; import java.sql.SQLException;
import java.sql.Types; import java.sql.Types;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.HashMap;
import java.util.List; import java.util.List;
import java.util.Map; import java.util.Map;
import java.util.concurrent.TimeUnit; import java.util.concurrent.TimeUnit;
...@@ -91,11 +94,7 @@ public class WebContentPaserByCss { ...@@ -91,11 +94,7 @@ public class WebContentPaserByCss {
//抽取资讯url //抽取资讯url
List<CatchWebByMetaSearch> catchWebByMetaSearches = parserCrawlerSiteListByCss(siteMsgTemple, doc); List<CatchWebByMetaSearch> catchWebByMetaSearches = parserCrawlerSiteListByCss(siteMsgTemple, doc);
catchWebByMetaSearchList.addAll(catchWebByMetaSearches); catchWebByMetaSearchList.addAll(catchWebByMetaSearches);
if(StringUtils.isNotEmpty(siteMsgTemple.getIsScreenshot()) && siteMsgTemple.getIsScreenshot().contains("1")){
String imagUrl="";
WebPageScreenShot webPageScreenShot=new WebPageScreenShot();
webPageScreenShot.loadPage(uri_code,Constants.IMGPATH);
}
} catch (Exception e) { } catch (Exception e) {
log.info("列表下载异常 对应的链接:"+uri_code); log.info("列表下载异常 对应的链接:"+uri_code);
// return catchWebByMetaSearchList; // return catchWebByMetaSearchList;
...@@ -267,11 +266,7 @@ public class WebContentPaserByCss { ...@@ -267,11 +266,7 @@ public class WebContentPaserByCss {
} }
} }
} }
if(StringUtils.isNotEmpty(siteMsgTemple.getIsScreenshot()) && siteMsgTemple.getIsScreenshot().contains("1")){
String imagUrl="";
WebPageScreenShot webPageScreenShot=new WebPageScreenShot();
webPageScreenShot.loadPage(cwbm.getSourceaddress(),Constants.IMGPATH);
}
DocInfo docInfo = new DocInfo(); DocInfo docInfo = new DocInfo();
docInfo.setContentType("HTML"); docInfo.setContentType("HTML");
docInfo.setOrgId(cwbm.getOrgId()); docInfo.setOrgId(cwbm.getOrgId());
......
...@@ -7,6 +7,7 @@ import com.zzsn.crawler.outlinkfinder.DefaultOutlinkFinder; ...@@ -7,6 +7,7 @@ import com.zzsn.crawler.outlinkfinder.DefaultOutlinkFinder;
import com.zzsn.crawler.uriparser.HtmlPageParser; import com.zzsn.crawler.uriparser.HtmlPageParser;
import com.zzsn.crawler.uriparser.SeleniumTime; import com.zzsn.crawler.uriparser.SeleniumTime;
import com.zzsn.crawler.uriparser.WebPageScreenShot; import com.zzsn.crawler.uriparser.WebPageScreenShot;
import com.zzsn.crawler.uriparser.obs.ObsUpload;
import com.zzsn.download.PageBuilderParser; import com.zzsn.download.PageBuilderParser;
import com.zzsn.download.PageDownloader; import com.zzsn.download.PageDownloader;
import com.zzsn.entity.CatchWebByMetaSearch; import com.zzsn.entity.CatchWebByMetaSearch;
...@@ -26,9 +27,11 @@ import org.springframework.kafka.core.KafkaTemplate; ...@@ -26,9 +27,11 @@ import org.springframework.kafka.core.KafkaTemplate;
import org.springframework.scheduling.concurrent.ThreadPoolTaskExecutor; import org.springframework.scheduling.concurrent.ThreadPoolTaskExecutor;
import javax.annotation.Resource; import javax.annotation.Resource;
import java.io.InputStream;
import java.net.URI; import java.net.URI;
import java.net.URL; import java.net.URL;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.HashMap;
import java.util.List; import java.util.List;
import java.util.Map; import java.util.Map;
import java.util.concurrent.TimeUnit; import java.util.concurrent.TimeUnit;
...@@ -90,12 +93,7 @@ public class WebContentPaserByRegular { ...@@ -90,12 +93,7 @@ public class WebContentPaserByRegular {
TimeUnit.SECONDS.sleep(5); TimeUnit.SECONDS.sleep(5);
} }
} }
if(StringUtils.isNotEmpty(siteMsgTemple.getIsScreenshot()) && siteMsgTemple.getIsScreenshot().contains("1")){
String imagUrl="";
WebPageScreenShot webPageScreenShot=new WebPageScreenShot();
webPageScreenShot.loadPage(uri_code,Constants.IMGPATH);
}
//抽取资讯url //抽取资讯url
log.info("body的长度:"+body.length()); log.info("body的长度:"+body.length());
if(StringUtils.isNotEmpty(body)) { if(StringUtils.isNotEmpty(body)) {
...@@ -278,15 +276,6 @@ public class WebContentPaserByRegular { ...@@ -278,15 +276,6 @@ public class WebContentPaserByRegular {
} }
} }
//使用浏览器截取图片 //使用浏览器截取图片
if(StringUtils.isNotEmpty(siteMsgTemple.getIsScreenshot()) && siteMsgTemple.getIsScreenshot().contains("1")){
String imagUrl="";
WebPageScreenShot webPageScreenShot=new WebPageScreenShot();
webPageScreenShot.loadPage(cwbm.getSourceaddress(),Constants.IMGPATH);
// InputStream inputStream =SeleniumTime.getScreenshot(cwbm.getSourceaddress());
// HashMap map = ObsUpload.uploadInputStream(inputStream, "png");
// imagUrl=map.get("objectUrl").toString();
}
log.info("详情内容的长度:"+content.length()); log.info("详情内容的长度:"+content.length());
DocInfo docInfo = new DocInfo(); DocInfo docInfo = new DocInfo();
docInfo.setContentType("HTML"); docInfo.setContentType("HTML");
...@@ -330,6 +319,15 @@ public class WebContentPaserByRegular { ...@@ -330,6 +319,15 @@ public class WebContentPaserByRegular {
log.info("资讯的信息不全缺少标题、时间或内容!:"+cwbm.getSourceaddress()); log.info("资讯的信息不全缺少标题、时间或内容!:"+cwbm.getSourceaddress());
continue; continue;
} }
if (StringUtils.isNotEmpty(siteMsgTemple.getYnSnapshot()) && siteMsgTemple.getYnSnapshot().contains("1")) {
String imagUrl = "";
// WebPageScreenShot webPageScreenShot = new WebPageScreenShot();
// webPageScreenShot.loadPage(cwbm.getSourceaddress(), Constants.IMGPATH);
InputStream inputStream =SeleniumTime.getScreenshot(cwbm.getSourceaddress());
HashMap map = ObsUpload.uploadInputStream(inputStream, "png");
imagUrl=map.get("objectUrl").toString();
processitem.setScreenShotImg(imagUrl);
}
String docjson = mapper.writeValueAsString(processitem); String docjson = mapper.writeValueAsString(processitem);
kafkaTemplate.send(Constants.KAFKA_PRODUCT_TOPIC, "key", docjson); kafkaTemplate.send(Constants.KAFKA_PRODUCT_TOPIC, "key", docjson);
log.info("发送到kafka成功。"); log.info("发送到kafka成功。");
......
...@@ -7,6 +7,7 @@ import com.zzsn.crawler.PaserSiteDownload; ...@@ -7,6 +7,7 @@ import com.zzsn.crawler.PaserSiteDownload;
import com.zzsn.crawler.uriparser.HtmlPageParser; import com.zzsn.crawler.uriparser.HtmlPageParser;
import com.zzsn.crawler.uriparser.SeleniumTime; import com.zzsn.crawler.uriparser.SeleniumTime;
import com.zzsn.crawler.uriparser.WebPageScreenShot; import com.zzsn.crawler.uriparser.WebPageScreenShot;
import com.zzsn.crawler.uriparser.obs.ObsUpload;
import com.zzsn.download.PageBuilderParser; import com.zzsn.download.PageBuilderParser;
import com.zzsn.download.PageDownloader; import com.zzsn.download.PageDownloader;
import com.zzsn.entity.CatchWebByMetaSearch; import com.zzsn.entity.CatchWebByMetaSearch;
...@@ -44,16 +45,14 @@ import javax.xml.transform.dom.DOMSource; ...@@ -44,16 +45,14 @@ import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult; import javax.xml.transform.stream.StreamResult;
import javax.xml.xpath.XPathExpressionException; import javax.xml.xpath.XPathExpressionException;
import java.io.IOException; import java.io.IOException;
import java.io.InputStream;
import java.io.StringWriter; import java.io.StringWriter;
import java.net.URI; import java.net.URI;
import java.net.URL; import java.net.URL;
import java.security.KeyManagementException; import java.security.KeyManagementException;
import java.security.KeyStoreException; import java.security.KeyStoreException;
import java.security.NoSuchAlgorithmException; import java.security.NoSuchAlgorithmException;
import java.util.ArrayList; import java.util.*;
import java.util.Collections;
import java.util.List;
import java.util.Map;
import java.util.concurrent.TimeUnit; import java.util.concurrent.TimeUnit;
import java.util.regex.Matcher; import java.util.regex.Matcher;
import java.util.regex.Pattern; import java.util.regex.Pattern;
...@@ -115,11 +114,6 @@ public class WebContentPaserByXpath { ...@@ -115,11 +114,6 @@ public class WebContentPaserByXpath {
//抽取资讯url //抽取资讯url
List<CatchWebByMetaSearch> catchWebByMetaSearches = parserCrawlerSiteListByXpath(siteMsgTemple, body); List<CatchWebByMetaSearch> catchWebByMetaSearches = parserCrawlerSiteListByXpath(siteMsgTemple, body);
catchWebByMetaSearchList.addAll(catchWebByMetaSearches); catchWebByMetaSearchList.addAll(catchWebByMetaSearches);
if(StringUtils.isNotEmpty(siteMsgTemple.getIsScreenshot()) && siteMsgTemple.getIsScreenshot().contains("1")){
String imagUrl="";
WebPageScreenShot webPageScreenShot=new WebPageScreenShot();
webPageScreenShot.loadPage(uri_code,Constants.IMGPATH);
}
} catch (Exception e) { } catch (Exception e) {
log.info("列表下载异常 对应的链接:"+uri_code); log.info("列表下载异常 对应的链接:"+uri_code);
if(seleniumTime!=null) { if(seleniumTime!=null) {
...@@ -324,11 +318,7 @@ public class WebContentPaserByXpath { ...@@ -324,11 +318,7 @@ public class WebContentPaserByXpath {
} }
} }
} }
if(StringUtils.isNotEmpty(siteMsgTemple.getIsScreenshot()) && siteMsgTemple.getIsScreenshot().contains("1")){
String imagUrl="";
WebPageScreenShot webPageScreenShot=new WebPageScreenShot();
webPageScreenShot.loadPage(cwbm.getSourceaddress(),Constants.IMGPATH);
}
DocInfo docInfo = new DocInfo(); DocInfo docInfo = new DocInfo();
docInfo.setContentType("HTML"); docInfo.setContentType("HTML");
docInfo.setOrgId(cwbm.getOrgId()); docInfo.setOrgId(cwbm.getOrgId());
...@@ -368,6 +358,15 @@ public class WebContentPaserByXpath { ...@@ -368,6 +358,15 @@ public class WebContentPaserByXpath {
}else{ }else{
processitem.setSource("1"); processitem.setSource("1");
} }
if (StringUtils.isNotEmpty(siteMsgTemple.getYnSnapshot()) && siteMsgTemple.getYnSnapshot().contains("1")) {
String imagUrl = "";
// WebPageScreenShot webPageScreenShot = new WebPageScreenShot();
// webPageScreenShot.loadPage(cwbm.getSourceaddress(), Constants.IMGPATH);
InputStream inputStream =SeleniumTime.getScreenshot(cwbm.getSourceaddress());
HashMap map = ObsUpload.uploadInputStream(inputStream, "png");
imagUrl=map.get("objectUrl").toString();
processitem.setScreenShotImg(imagUrl);
}
String docjson = mapper.writeValueAsString(processitem); String docjson = mapper.writeValueAsString(processitem);
kafkaTemplate.send(Constants.KAFKA_PRODUCT_TOPIC, "key", docjson); kafkaTemplate.send(Constants.KAFKA_PRODUCT_TOPIC, "key", docjson);
log.info("发送到kafka成功。"); log.info("发送到kafka成功。");
......
...@@ -38,8 +38,9 @@ public class PageConnectioner { ...@@ -38,8 +38,9 @@ public class PageConnectioner {
HttpURLConnection connection = null; HttpURLConnection connection = null;
try { try {
url = new URL(urlstr); url = new URL(urlstr);
if (Constants.PROXYID==1) { if (Constants.PROXYID==5) {
String proxyIP = getProxyIP(); String proxyIP = getProxyIP();
System.out.println("代理IP :"+proxyIP);
String[] proxys=proxyIP.split("-"); String[] proxys=proxyIP.split("-");
String proxyHost = proxys[0]; String proxyHost = proxys[0];
int proxyPort = Integer.parseInt(proxys[1]); int proxyPort = Integer.parseInt(proxys[1]);
...@@ -88,7 +89,7 @@ public class PageConnectioner { ...@@ -88,7 +89,7 @@ public class PageConnectioner {
} }
} }
public static String getProxyIP(){ public static String getProxyIP(){
String searchSql = "select proxy from CIS_sys_Proxy where ID = 1"; String searchSql = "select proxy from CIS_sys_Proxy where ID = 4";
String proxy=""; String proxy="";
OracleDBManager dm = new OracleDBManager(); OracleDBManager dm = new OracleDBManager();
String[] coulmn = null; String[] coulmn = null;
...@@ -257,6 +258,7 @@ public class PageConnectioner { ...@@ -257,6 +258,7 @@ public class PageConnectioner {
url = new URL(urlstr); url = new URL(urlstr);
if (Constants.PROXYID==1) { if (Constants.PROXYID==1) {
String proxyIP = getProxyIP(); String proxyIP = getProxyIP();
System.out.println("代理IP :"+proxyIP);
String[] proxys=proxyIP.split("-"); String[] proxys=proxyIP.split("-");
String proxyHost = proxys[0]; String proxyHost = proxys[0];
int proxyPort = Integer.parseInt(proxys[1]); int proxyPort = Integer.parseInt(proxys[1]);
......
...@@ -94,4 +94,6 @@ public class ClbAnsProcessitem { ...@@ -94,4 +94,6 @@ public class ClbAnsProcessitem {
/**(临时处理)关联的专题id*/ /**(临时处理)关联的专题id*/
private List<String> subjectIds; private List<String> subjectIds;
/**网页图片快照地址*/
private String screenShotImg;
} }
...@@ -27,9 +27,11 @@ public class SiteMsgTemple implements Serializable { ...@@ -27,9 +27,11 @@ public class SiteMsgTemple implements Serializable {
/**历史数据URL*/ /**历史数据URL*/
private String hisUriExp; private String hisUriExp;
/**历史数据开始时间*/ /**历史数据开始时间*/
private java.util.Date hisDateStartTime; // private java.util.Date hisDateStartTime;
private String hisDateStartTime;
/**历史数据结束时间*/ /**历史数据结束时间*/
private java.util.Date hisDateEndTime; // private java.util.Date hisDateEndTime;
private String hisDateEndTime;
/**是否历史所有数据*/ /**是否历史所有数据*/
private String ynHisDataAll; private String ynHisDataAll;
/**网站级别*/ /**网站级别*/
...@@ -153,11 +155,15 @@ public class SiteMsgTemple implements Serializable { ...@@ -153,11 +155,15 @@ public class SiteMsgTemple implements Serializable {
/**cron表达式*/ /**cron表达式*/
private String cron; private String cron;
/**是否需要快照*/ // /**是否需要快照*/
private String isScreenshot; // private String isScreenshot;
//++++++++++++++++++++++++++++++++++++++++++++++++++ //++++++++++++++++++++++++++++++++++++++++++++++++++
private Pattern pattern; private Pattern pattern;
/**是否保存快照(1:保存 0:不保存)*/
private String ynSnapshot;
/**是否为验证方法(1:是验证 0:不是)*/
private String verifyType;
} }
...@@ -114,49 +114,5 @@ public class KafkaConsumerJob { ...@@ -114,49 +114,5 @@ public class KafkaConsumerJob {
} }
/**
 * Force-kills every running {@code chromedriver.exe} process via the Windows
 * {@code taskkill} command and waits for the command to finish.
 *
 * <p>Best effort: a failure to start the command is reported on stderr and the
 * method returns normally. If the calling thread is interrupted while waiting,
 * the interrupt flag is restored so the executor can observe it.
 */
// @Scheduled(cron = "0 0/30 * * * ?")
@Async("asyncTaskExecutor")
public void runtimeTask (){
    try {
        Runtime mt = Runtime.getRuntime();
        String cmd = "taskkill /F /im chromedriver.exe";
        Process pro = mt.exec(cmd);
        pro.waitFor();
        System.out.println("++++++++ taskkill /F /im chromedriver.exe");
    } catch (IOException ioe) {
        // taskkill is unavailable or could not be launched; report it instead of hiding it
        System.err.println("taskkill /F /im chromedriver.exe could not be started: " + ioe);
    } catch (InterruptedException e) {
        // restore the interrupt status rather than swallowing it
        Thread.currentThread().interrupt();
    }
}
/**
 * Force-kills every running {@code chrome.exe} process via the Windows
 * {@code taskkill} command and waits for the command to finish.
 *
 * <p>Best effort: a failure to start the command is reported on stderr and the
 * method returns normally. If the calling thread is interrupted while waiting,
 * the interrupt flag is restored so the executor can observe it.
 */
// @Scheduled(cron = "0 0/25 * * * ?")
@Async("asyncTaskExecutor")
public void runtimeTask2 (){
    try {
        Runtime mt = Runtime.getRuntime();
        String cmd = "taskkill /F /im chrome.exe";
        Process pro = mt.exec(cmd);
        pro.waitFor();
        System.out.println("++++++++ taskkill /F /im chrome.exe");
    } catch (IOException ioe) {
        // taskkill is unavailable or could not be launched; report it instead of hiding it
        System.err.println("taskkill /F /im chrome.exe could not be started: " + ioe);
    } catch (InterruptedException e) {
        // restore the interrupt status rather than swallowing it
        Thread.currentThread().interrupt();
    }
}
} }
...@@ -108,7 +108,7 @@ public class ChromeTest { ...@@ -108,7 +108,7 @@ public class ChromeTest {
try { try {
// 测试打开bing // 测试打开bing
driver.get("https://www.bing.com"); driver.get("http://www.sgcc.com.cn/html/sgcc_main/col2017021879/column_2017021879_1.shtml");
// getTitle()获取当前页面title的值 // getTitle()获取当前页面title的值
System.out.println("当前打开页面的标题是: " + driver.getTitle()); System.out.println("当前打开页面的标题是: " + driver.getTitle());
......
package com.zzsn.test;
import com.gargoylesoftware.htmlunit.*;
import com.gargoylesoftware.htmlunit.html.HtmlPage;
import java.io.IOException;
import java.net.MalformedURLException;
/**
 * Manual smoke test: renders a page with HtmlUnit (JavaScript enabled) and prints the
 * resulting markup to stdout.
 */
public class WebClientTest {
    /**
     * Downloads a fixed sample page and prints its length followed by its XML form.
     * Prints a short notice instead when the download yields no HTML.
     */
    public static void main(String[] args) throws Exception{
        String url="http://www.sgcc.com.cn/html/sgcc_main/col2017021879/column_2017021879_1.shtml";
        String charset="utf-8";
        String s = downloadByWebClient(url, charset);
        if (s == null) {
            // downloadByWebClient returns null on failure or for a non-HTML response
            System.out.println("download failed or response was not an HTML page: " + url);
            return;
        }
        System.out.println(s.length());
        System.out.println("++++++++++++++++++++++++++++++++++++++++++++++++");
        System.out.println(s);
    }

    /**
     * Loads {@code urlstr} in a headless HtmlUnit Chrome profile, waits up to five seconds
     * for background JavaScript, and returns the page serialized with {@code asXml()}.
     *
     * @param urlstr  address of the page to fetch
     * @param charset currently unused by this method; kept for interface compatibility
     * @return the page markup, or {@code null} if the response is not an {@code HtmlPage}
     *         or the fetch failed (the failure is reported on stderr)
     */
    public static String downloadByWebClient(String urlstr,String charset) throws FailingHttpStatusCodeException, MalformedURLException, IOException {
        String pageStr = null;
        // try-with-resources guarantees the client is closed on every exit path
        try (WebClient webClient = new WebClient(BrowserVersion.CHROME)) {
            webClient.getOptions().setJavaScriptEnabled(true);
            webClient.getOptions().setActiveXNative(true);
            webClient.getOptions().setCssEnabled(true); // enable CSS
            webClient.getOptions().setRedirectEnabled(true); // original note: temporarily changed for Baidu Yuedu
            webClient.getOptions().setThrowExceptionOnScriptError(false);
            webClient.getOptions().setThrowExceptionOnFailingStatusCode(false);
            webClient.setAjaxController(new NicelyResynchronizingAjaxController()); // important: enables AJAX support
            webClient.getOptions().setTimeout(20000); // request timeout of the simulated browser
            webClient.setJavaScriptTimeout(30000); // JavaScript execution timeout
            Page page = webClient.getPage(urlstr);
            if (page instanceof HtmlPage) {
                HtmlPage htmlPage = (HtmlPage) page;
                webClient.waitForBackgroundJavaScript(5000); // blocks the calling thread
                pageStr = htmlPage.asXml();
            }
        } catch (Exception e) {
            // best effort: keep returning null, but make the failure visible
            System.err.println("downloadByWebClient failed for " + urlstr + ": " + e);
        }
        return pageStr;
    }
}
package com.zzsn.util; package com.zzsn.util;
import com.zzsn.crawler.paser.SubtractionTag;
import com.zzsn.crawler.paser.WebContentPaserByRegular;
import com.zzsn.crawler.uriparser.HtmlPageParser;
import com.zzsn.download.PageBuilderParser;
import org.apache.commons.lang3.StringUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.select.Elements;
import java.text.ParseException; import java.text.ParseException;
import java.text.SimpleDateFormat; import java.text.SimpleDateFormat;
import java.time.LocalDate; import java.time.LocalDate;
...@@ -348,6 +357,8 @@ public class PublishDateUtil { ...@@ -348,6 +357,8 @@ public class PublishDateUtil {
SimpleDateFormat sdf11 = new SimpleDateFormat("MMM dd, yyyy", Locale.CHINA); SimpleDateFormat sdf11 = new SimpleDateFormat("MMM dd, yyyy", Locale.CHINA);
SimpleDateFormat sdf12 = new SimpleDateFormat("MMM. d, yyyy", Locale.ENGLISH); SimpleDateFormat sdf12 = new SimpleDateFormat("MMM. d, yyyy", Locale.ENGLISH);
SimpleDateFormat sdf13 = new SimpleDateFormat("dd. MMMM yyyy",Locale.ENGLISH); SimpleDateFormat sdf13 = new SimpleDateFormat("dd. MMMM yyyy",Locale.ENGLISH);
SimpleDateFormat sdf14 = new SimpleDateFormat("dd MMM yyyy",Locale.ITALY);
SimpleDateFormat sdf15 = new SimpleDateFormat("dd MMM. yyyy",Locale.ENGLISH);
try { try {
date = sdf1.parse(dateStr); date = sdf1.parse(dateStr);
...@@ -391,8 +402,17 @@ public class PublishDateUtil { ...@@ -391,8 +402,17 @@ public class PublishDateUtil {
try { try {
date = sdf13.parse(dateStr); date = sdf13.parse(dateStr);
} catch (ParseException e11) { } catch (ParseException e11) {
try {
date = sdf14.parse(dateStr);
} catch (ParseException e12) {
e11.printStackTrace(); try {
date = sdf15.parse(dateStr);
} catch (ParseException e13) {
date=new Date();
// e13.printStackTrace();
}
}
} }
} }
} }
...@@ -439,12 +459,96 @@ public class PublishDateUtil { ...@@ -439,12 +459,96 @@ public class PublishDateUtil {
// Date str1= new SimpleDateFormat("yyyy, MM月 dd").parse(str); // Date str1= new SimpleDateFormat("yyyy, MM月 dd").parse(str);
// System.out.println(str1); // System.out.println(str1);
// } // }
String str1= PublishDateUtil.getPublishDate("Tuesday, 14 June 2022"); String str1= PublishDateUtil.getPublishDate("30 Jun. 2022 ");
// String str1= PublishDateUtil.getPublishDate("星期一, 2021, 11月 8 - 17:23"); // String str1= PublishDateUtil.getPublishDate("星期一, 2021, 11月 8 - 17:23");
//String str2= PublishDateUtil.getPublishDate("2021 年 11 月 23 日"); //String str2= PublishDateUtil.getPublishDate("2021 年 11 月 23 日");
// System.out.println(new SimpleDateFormat("MMM dd, yyyy", Locale.CHINA).parse("十二月 15, 2021")); // System.out.println(new SimpleDateFormat("MMM dd, yyyy", Locale.CHINA).parse("十二月 15, 2021"));
// SimpleDateFormat sdf1 = new SimpleDateFormat("dd MMM yyyy",Locale.ITALY);
System.out.println(str1); System.out.println(str1);
// String aa="";
// String s = paseElementByCSS(Jsoup.parse(aa), "<publish_date><exp>*.div[class=\"news-date\"].time</exp><subtraction>span</subtraction></publish_date>");
}
public static String paseElementByCSS(Document doc, String xmlTag){
String tag="";
String subtraction ="";
String attr="";
try {
PageBuilderParser pageBuilderParser = new PageBuilderParser();
org.w3c.dom.Document document = HtmlPageParser.xmlGetDocument(xmlTag);
tag = pageBuilderParser.parserStr(document, "//exp");
if(StringUtils.contains(xmlTag,"subtraction")){
subtraction = pageBuilderParser.parserStr(document, "//subtraction");
}
if(StringUtils.contains(xmlTag,"attr")){
attr = pageBuilderParser.parserStr(document, "//attr");
}
}catch (Exception e){
}
String msg="";
try {
SubtractionTag subtractionTag=new SubtractionTag();
String [] removeTags=subtraction==""? null : subtraction.split(",");
String subHtml = subtractionTag.extract(doc, removeTags);
doc = Jsoup.parse(subHtml);
// Document document = Jsoup.parse(subHtml);
tag=tag.replace("*.","");
tag=tag.replace(".",">");
String index="";
String matchUrl="\\[\\d+\\]";
Pattern pattern = Pattern.compile(matchUrl);
String[] tagss=tag.split(">");
String tagMsg="";
for (int i = 0; i < tagss.length; i++) {
Matcher matcher = pattern.matcher(tagss[i]);
String aa="";
if(matcher.find()){
index=matcher.group();
int idx=0;
if(StringUtils.isNotEmpty(index)) {
idx = Integer.parseInt(index.replace("[", "").replace("]", ""));
if(idx>0) {
idx = idx - 1;
// index = "[" + idx + "]";
}
}
tagss[i]=tagss[i].replace(index,"");
tagss[i]=tagss[i]+":eq("+idx+")";
}
tagMsg+=tagss[i]+">";
}
tagMsg=tagMsg.substring(0,tagMsg.lastIndexOf(">"));
Elements elements =null;
if(tag.contains("|")){
String[] tags=tag.split("\\|");
for (int i = 0; i < tags.length ; i++) {
elements = doc.select(tags[i].replace("*.","").replace(".",">"));
if(elements.size()>0){
break;
}
}
}else {
// tagMsg
elements = doc.select(tagMsg);
// elements = doc.select(tag);
}
int idx=0;
// if(StringUtils.isNotEmpty(index)){
// idx=Integer.parseInt(index.replace("[","").replace("]",""));
// idx=idx-1;
// }
if (elements.size() > 0 && StringUtils.isNotEmpty(attr.replace("*.",""))) {
msg = elements.get(idx).attr(attr).trim();
}else if(elements.size() > 0){
msg = elements.get(idx).text().trim();
}
}catch (Exception e){
e.printStackTrace();
}finally {
return msg;
}
} }
} }
...@@ -33,7 +33,9 @@ public class WindowsProcess { ...@@ -33,7 +33,9 @@ public class WindowsProcess {
* @date 2022/7/26 11:23 * @date 2022/7/26 11:23
*/ */
// @Scheduled(cron = "0 0 1 * * ?") // @Scheduled(cron = "0 0 1 * * ?")
@Scheduled(cron = "0 0 0/4 * * ? ")
private void killProcess() { private void killProcess() {
log.info("定时关闭浏览器并重新打开");
try { try {
String line; String line;
Process p = Runtime.getRuntime().exec("tasklist.exe"); Process p = Runtime.getRuntime().exec("tasklist.exe");
......
...@@ -95,7 +95,7 @@ HUAWEICLOUD_SK= heR353lvSWVPNU8pe2QxDtd8GDsO5L6PGH5eUoQY ...@@ -95,7 +95,7 @@ HUAWEICLOUD_SK= heR353lvSWVPNU8pe2QxDtd8GDsO5L6PGH5eUoQY
#IMGPATH= E:\\chrome\\img\\shot.png #IMGPATH= E:\\chrome\\img\\shot.png
IMGPATH= E:\\ideaWorkerspace\\meta_crawler\\comm_crawler\\src\\main\\resources\\aa.txt IMGPATH= E:\\ideaWorkerspace\\meta_crawler\\comm_crawler\\src\\main\\resources\\aa.txt
selenium.driver.cache=comm_selenium_driver_cache_1 selenium.driver.cache=selenium_driver_cache_loc112
......
# Redis settings # Redis settings
redis.host=114.115.236.206 #redis.host=114.115.236.206
redis.port=6379
redis.pass=clbzzsn
redis.timeout=10000
#redis.host=127.0.0.1
#redis.port=6379 #redis.port=6379
#redis.pass=xxxxxx #redis.pass=clbzzsn
#redis.timeout=10000 #redis.timeout=10000
redis.host=127.0.0.1
redis.port=6379
redis.pass=xxxxxx
redis.timeout=10000
redis.maxIdle=300 redis.maxIdle=300
redis.maxTotal=600 redis.maxTotal=600
......
Global Development Initiative ATCC
GDI \ No newline at end of file
\ No newline at end of file
...@@ -337,7 +337,7 @@ ...@@ -337,7 +337,7 @@
<dependency> <dependency>
<groupId>com.alibaba</groupId> <groupId>com.alibaba</groupId>
<artifactId>druid</artifactId> <artifactId>druid</artifactId>
<version>1.0.5</version> <version>1.1.10</version>
</dependency> </dependency>
<dependency> <dependency>
<groupId>com.alibaba</groupId> <groupId>com.alibaba</groupId>
......
package com.zzsn; package com.zzsn;
import com.google.gson.Gson;
import com.zzsn.search.MetaGoogleSearchThread;
import com.zzsn.search.entity.KeywordMsg;
import com.zzsn.utility.index.Constants;
import lombok.extern.slf4j.Slf4j;
import org.apache.kafka.clients.CommonClientConfigs;
import org.apache.kafka.clients.consumer.ConsumerConfig;
import org.apache.kafka.clients.consumer.ConsumerRecord;
import org.apache.kafka.clients.consumer.ConsumerRecords;
import org.apache.kafka.clients.consumer.KafkaConsumer;
import org.apache.kafka.common.TopicPartition;
import org.apache.kafka.common.serialization.StringDeserializer;
import org.springframework.boot.CommandLineRunner;
import org.springframework.boot.SpringApplication; import org.springframework.boot.SpringApplication;
import org.springframework.boot.autoconfigure.SpringBootApplication; import org.springframework.boot.autoconfigure.SpringBootApplication;
import org.springframework.boot.builder.SpringApplicationBuilder; import org.springframework.boot.builder.SpringApplicationBuilder;
import org.springframework.boot.web.servlet.ServletComponentScan; import org.springframework.boot.web.servlet.ServletComponentScan;
import org.springframework.boot.web.servlet.support.SpringBootServletInitializer; import org.springframework.boot.web.servlet.support.SpringBootServletInitializer;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Properties;
@Slf4j
@SpringBootApplication(scanBasePackages = "com.zzsn") @SpringBootApplication(scanBasePackages = "com.zzsn")
public class CrawlerStaticApplication extends SpringBootServletInitializer { public class CrawlerStaticApplication extends SpringBootServletInitializer implements CommandLineRunner {
@Override @Override
protected SpringApplicationBuilder configure(SpringApplicationBuilder builder) { protected SpringApplicationBuilder configure(SpringApplicationBuilder builder) {
return builder.sources(CrawlerStaticApplication.class); return builder.sources(CrawlerStaticApplication.class);
...@@ -15,5 +34,80 @@ public class CrawlerStaticApplication extends SpringBootServletInitializer { ...@@ -15,5 +34,80 @@ public class CrawlerStaticApplication extends SpringBootServletInitializer {
public static void main(String[] args) { public static void main(String[] args) {
SpringApplication.run(CrawlerStaticApplication.class, args); SpringApplication.run(CrawlerStaticApplication.class, args);
} }
/**
 * Start-up hook invoked with the application's command-line arguments.
 * It only delegates to {@code consumerKeyword()}.
 * The commented-out block below is a hard-coded sample keyword message kept for manual testing.
 */
@Override
public void run(String... args) throws Exception {
// System.out.println("——————++++++++++++——————===");
// String key="{\n" +
// " \"@class\": \"com.zzsn.clb.common.model.task.dto.titr.KeyWordsDTO\",\n" +
// " \"id\": \"1513415223147425794\",\n" +
// " \"wordsCode\": \"KW-20220411-0003\",\n" +
// " \"wordsName\": \"链长\",\n" +
// " \"keyWord\": \"链长\",\n" +
// " \"exclusionWord\": null,\n" +
// " \"status\": \"1\",\n" +
// " \"subjectId\": null,\n" +
// " \"subjectIds\": null,\n" +
// " \"startTime\": \"1607443200000\",\n" +
// " \"endTime\": null\n" +
// "}";
// KeywordMsg keywordMsg = new Gson().fromJson(key, KeywordMsg.class);
// MetaSouGouSearchThread metaSearchThread=new MetaSouGouSearchThread();
// metaSearchThread.keywordMsg=keywordMsg;
// metaSearchThread.crawler();
// NOTE(review): consumerKeyword() loops until an exception occurs, so this call presumably blocks the runner thread - confirm
consumerKeyword ();
}
/**
 * Polls the configured Kafka partitions for keyword messages and runs a Google meta-search
 * crawl for every message whose {@code searchEngines} list contains {@code "4"}.
 *
 * <p>Partitions are assigned manually from the comma-separated
 * {@code Constants.KAFKA_CONSUMER_PARTITION} setting. Offsets are committed right after each
 * poll, before the records are processed, so a record that fails is logged and skipped
 * rather than redelivered. The loop ends only when polling or committing throws; the
 * consumer is always closed on the way out.
 */
public void consumerKeyword (){
    log.info("定时获取mq消息");
    // Parse the partition list first so a malformed setting fails before a consumer is opened.
    List<TopicPartition> topicPartitions = new ArrayList<>();
    for (String partition : Constants.KAFKA_CONSUMER_PARTITION.split(",")) {
        topicPartitions.add(new TopicPartition(Constants.KAFKA_CONSUMER_TOPIC, Integer.parseInt(partition.trim())));
    }
    KafkaConsumer<String, String> consumer = createConsumer();
    try {
        consumer.assign(topicPartitions);
        Gson gson = new Gson();
        try {
            while(true){
                // NOTE(review): a zero timeout makes this loop spin when the topic is idle - consider a small wait
                ConsumerRecords<String, String> records = consumer.poll(0);
                consumer.commitSync();
                for(ConsumerRecord<String, String> record : records){
                    try {
                        KeywordMsg keywordMsg = gson.fromJson(record.value(), KeywordMsg.class);
                        if (keywordMsg == null) {
                            continue;
                        }
                        List<String> searchEngines = keywordMsg.getSearchEngines();
                        if(searchEngines!=null && searchEngines.contains("4")) {
                            MetaGoogleSearchThread metaSearchThread = new MetaGoogleSearchThread();
                            metaSearchThread.keywordMsg = keywordMsg;
                            metaSearchThread.crawler();
                        }
                    } catch (Exception e) {
                        // one bad message must not stop the whole consumer
                        log.error("Failed to process keyword message at offset {}", record.offset(), e);
                    }
                }
            }
        } catch (Exception e) {
            log.error("Kafka keyword consumer stopped", e);
        }
    } finally {
        // previously the consumer was never closed and an unused replacement was created instead
        consumer.close();
    }
}
/**
 * Builds a String/String Kafka consumer from the {@code Constants.KAFKA_CONSUMER_*} settings.
 * Auto-commit is disabled, at most one record is returned per poll, and the maximum
 * interval between polls is one hour.
 *
 * @return a new, unsubscribed consumer
 */
private static KafkaConsumer<String, String> createConsumer() {
    System.out.println(Constants.KAFKA_CONSUMER_SERVERS);
    final int oneHourMs = 60*60*1000;

    Properties consumerProps = new Properties();
    // connection and group identity
    consumerProps.put(CommonClientConfigs.BOOTSTRAP_SERVERS_CONFIG, Constants.KAFKA_CONSUMER_SERVERS);
    consumerProps.put(ConsumerConfig.GROUP_ID_CONFIG, Constants.KAFKA_CONSUMER_GROUP_ID);
    // keys and values are plain strings
    consumerProps.put(ConsumerConfig.KEY_DESERIALIZER_CLASS_CONFIG, StringDeserializer.class);
    consumerProps.put(ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG, StringDeserializer.class);
    // offsets are committed explicitly by the caller
    consumerProps.put(ConsumerConfig.ENABLE_AUTO_COMMIT_CONFIG, "false");
    // where to start when no committed offset exists (latest / earliest)
    consumerProps.put(ConsumerConfig.AUTO_OFFSET_RESET_CONFIG,Constants.KAFKA_CONSUMER_AUTO_OFFSET_RESET);
    // allow up to one hour between polls, one record per poll
    consumerProps.put("max.poll.interval.ms", oneHourMs);
    consumerProps.put(ConsumerConfig.MAX_POLL_RECORDS_CONFIG, 1);

    KafkaConsumer<String, String> kafkaConsumer = new KafkaConsumer<>(consumerProps);
    return kafkaConsumer;
}
} }
\ No newline at end of file
...@@ -46,7 +46,7 @@ public class KafkaConsumerGoogleTask { ...@@ -46,7 +46,7 @@ public class KafkaConsumerGoogleTask {
} }
//打包编译时修改定时启动的任务 如果是搜索关键词放开consumerKeyword 如果是内容解析放开consumerDetailUrl 上面的定时任务 //打包编译时修改定时启动的任务 如果是搜索关键词放开consumerKeyword 如果是内容解析放开consumerDetailUrl 上面的定时任务
@Scheduled(cron = "0 0/2 * * * ?") // @Scheduled(cron = "0 0/2 * * * ?")
@Async("webExecutor") @Async("webExecutor")
public void consumerKeyword (){ public void consumerKeyword (){
log.info("定时获取mq消息"); log.info("定时获取mq消息");
......
...@@ -25,8 +25,8 @@ public class WebGoogleSearch { ...@@ -25,8 +25,8 @@ public class WebGoogleSearch {
// String filepath=args[0]; // String filepath=args[0];
String filepath= Constants.META_SEARCH_KEYWORDPATH; String filepath= Constants.META_SEARCH_KEYWORDPATH;
String startTime="2021-09-01"; String startTime="2019-06-01";
String endTime="2022-07-01"; String endTime="2022-08-04";
startTime=dateToStamp(startTime); startTime=dateToStamp(startTime);
endTime=dateToStamp(endTime); endTime=dateToStamp(endTime);
File f = new File(filepath); File f = new File(filepath);
...@@ -60,7 +60,7 @@ public class WebGoogleSearch { ...@@ -60,7 +60,7 @@ public class WebGoogleSearch {
webGoogleSearchThread.setStartTime(startTime); webGoogleSearchThread.setStartTime(startTime);
webGoogleSearchThread.setEndTime(endTime); webGoogleSearchThread.setEndTime(endTime);
KeywordMsg keywordMsg=new KeywordMsg(); KeywordMsg keywordMsg=new KeywordMsg();
keywordMsg.setId("2020070101"); keywordMsg.setId("2022080401");
keywordMsg.setStartTime(Long.parseLong(startTime)); keywordMsg.setStartTime(Long.parseLong(startTime));
keywordMsg.setEndTime(Long.parseLong(endTime)); keywordMsg.setEndTime(Long.parseLong(endTime));
......
...@@ -366,7 +366,7 @@ public class WebGoogleSearchThread implements Runnable { ...@@ -366,7 +366,7 @@ public class WebGoogleSearchThread implements Runnable {
log.info("title:"+docInfo.getTitle()+"|address:"+docInfo.getSourceaddress()+ log.info("title:"+docInfo.getTitle()+"|address:"+docInfo.getSourceaddress()+
"|content:"+(docInfo.getContentNoTag()==null?"":docInfo.getContentNoTag().length()+"")); "|content:"+(docInfo.getContentNoTag()==null?"":docInfo.getContentNoTag().length()+""));
// intsertData(docInfo); intsertData(docInfo);
//信息转换 //信息转换
ClbAnsProcessitem processitem = docInfoTrans2Processitem(docInfo); ClbAnsProcessitem processitem = docInfoTrans2Processitem(docInfo);
ObjectMapper mapper = new ObjectMapper(); ObjectMapper mapper = new ObjectMapper();
......
...@@ -416,7 +416,7 @@ public class MetaGoogleSearchThread implements Runnable { ...@@ -416,7 +416,7 @@ public class MetaGoogleSearchThread implements Runnable {
clbAnsProcessitem.setOrigin(docInfo.getOrigin()); clbAnsProcessitem.setOrigin(docInfo.getOrigin());
clbAnsProcessitem.setPublishDate(docInfo.getPublishDate()); clbAnsProcessitem.setPublishDate(docInfo.getPublishDate());
clbAnsProcessitem.setSourceAddress(docInfo.getSourceaddress()); clbAnsProcessitem.setSourceAddress(docInfo.getSourceaddress());
clbAnsProcessitem.setSource("4");
return clbAnsProcessitem; return clbAnsProcessitem;
} }
//转换qq新闻链接 //转换qq新闻链接
......
...@@ -29,5 +29,9 @@ public class KeywordMsg { ...@@ -29,5 +29,9 @@ public class KeywordMsg {
private Long startTime; private Long startTime;
private Long endTime; private Long endTime;
//需要启动的信息采集器
private List<String> searchEngines;
//采集的要求(1:标题 2:正文 3:全文)
private String crawlerType;
} }
This source diff could not be displayed because it is too large. You can view the blob instead.
...@@ -27,19 +27,19 @@ public class Data2OracleJob { ...@@ -27,19 +27,19 @@ public class Data2OracleJob {
public CisAnsBasedataServiceImpl cisAnsBasedataService= SpringContextUtil.getBean(CisAnsBasedataServiceImpl.class); public CisAnsBasedataServiceImpl cisAnsBasedataService= SpringContextUtil.getBean(CisAnsBasedataServiceImpl.class);
// @Scheduled(cron = "0 0 7 * * ?") // @Scheduled(cron = "0 0 7 * * ?")
@Scheduled(cron = "0 0/6 * * * ?") @Scheduled(cron = "0 0/5 * * * ?")
public void consumer() { public void consumer() {
try { try {
String subjectId = "1536238367948042241"; String subjectId = "1531517573120557057";
String mid = ""; String mid = "81688199865196552";
try { // try {
String basedata = JedisUtil.getString("basedata"); // String basedata = JedisUtil.getString("basedata");
mid = basedata; // mid = basedata;
} catch (Exception e) { // } catch (Exception e) {
//
} // }
if (StringUtils.isEmpty(mid)) { if (StringUtils.isEmpty(mid)) {
mid = "0"; mid = "81688199865196552";
} }
String apiPath = "http://114.115.236.206:9988/datapull/thirdParty/queryDataList?id=[id]&subjectId=[subjectId]&pageNo=1"; String apiPath = "http://114.115.236.206:9988/datapull/thirdParty/queryDataList?id=[id]&subjectId=[subjectId]&pageNo=1";
String apiPath2 = apiPath.replace("[id]", mid).replace("[subjectId]", subjectId); String apiPath2 = apiPath.replace("[id]", mid).replace("[subjectId]", subjectId);
......
...@@ -46,7 +46,7 @@ public class CisAnsBasedataServiceImpl extends ServiceImpl<CisAnsBasedataMapper, ...@@ -46,7 +46,7 @@ public class CisAnsBasedataServiceImpl extends ServiceImpl<CisAnsBasedataMapper,
return Long.valueOf(map.get("serialNo")); return Long.valueOf(map.get("serialNo"));
}else }else
{ {
return null; return 0l;
} }
} }
...@@ -71,24 +71,29 @@ public class CisAnsBasedataServiceImpl extends ServiceImpl<CisAnsBasedataMapper, ...@@ -71,24 +71,29 @@ public class CisAnsBasedataServiceImpl extends ServiceImpl<CisAnsBasedataMapper,
public void saveWSBMsg(String apiPath) { public void saveWSBMsg(String apiPath) {
List<CisAnsBasedata> cisAnsBasedataList = httpRequest(apiPath); List<CisAnsBasedata> cisAnsBasedataList = httpRequest(apiPath);
for (CisAnsBasedata cisAnsBasedata: cisAnsBasedataList) { for (CisAnsBasedata cisAnsBasedata: cisAnsBasedataList) {
Long idl=getserialno()+Long.parseLong(getBasedataCountid());
cisAnsBasedata.setId(idl+"");
cisAnsBasedataMapper.saveCis(cisAnsBasedata);
//插入baseDatatype
Map<String,String> basedataType=new HashMap<>();
basedataType.put("id",getBaseDataTypeCountid());
basedataType.put("bid",idl+"");
basedataType.put("tid","21216");
basedataType.put("orgId","2401");
basedataType.put("publishDate",cisAnsBasedata.getCreateDate());
basedataType.put("createDate ",cisAnsBasedata.getCreateDate());
basedataType.put("relevance","0");
cisAnsBasedataMapper.saveBaseDataType(basedataType);
System.out.println("插入成功");
try { try {
Thread.sleep(1000); Long idl = getserialno() + Long.parseLong(getBasedataCountid());
}catch (Exception e){ cisAnsBasedata.setId(idl + "");
cisAnsBasedataMapper.saveCis(cisAnsBasedata);
//插入baseDatatype
Map<String, String> basedataType = new HashMap<>();
System.out.println("getBaseDataTypeCountid()" + getBaseDataTypeCountid());
basedataType.put("id", getBaseDataTypeCountid());
basedataType.put("bid", idl + "");
basedataType.put("tid", "18485");
basedataType.put("orgId", "3642");
basedataType.put("publishDate", cisAnsBasedata.getCreateDate());
basedataType.put("createDate ", cisAnsBasedata.getCreateDate());
basedataType.put("relevance", "0");
cisAnsBasedataMapper.saveBaseDataType(basedataType);
System.out.println("插入成功");
try {
Thread.sleep(2000);
} catch (Exception e) {
}
}catch (Exception e){
continue;
} }
} }
...@@ -140,26 +145,26 @@ public class CisAnsBasedataServiceImpl extends ServiceImpl<CisAnsBasedataMapper, ...@@ -140,26 +145,26 @@ public class CisAnsBasedataServiceImpl extends ServiceImpl<CisAnsBasedataMapper,
Date pDate=sdf.parse(publishDate.replace("T"," "));//按以上格式 将当前时间转换成字符串 Date pDate=sdf.parse(publishDate.replace("T"," "));//按以上格式 将当前时间转换成字符串
long diff = new Date().getTime() - pDate.getTime(); long diff = new Date().getTime() - pDate.getTime();
long day = 1000 * 60 * 60 * 24; long day = 1000 * 60 * 60 * 24;
if(diff/day>1){ // if(diff/day>1){
continue; // continue;
} // }
String format = sdf.format(pDate); String format = sdf.format(pDate);
CisAnsBasedata cisAnsBasedata=new CisAnsBasedata(); CisAnsBasedata cisAnsBasedata=new CisAnsBasedata();
cisAnsBasedata.setSid(sid); cisAnsBasedata.setSid(sid);
cisAnsBasedata.setTitle(title); cisAnsBasedata.setTitle(title);
cisAnsBasedata.setSummary(summary); cisAnsBasedata.setSummary(summary);
cisAnsBasedata.setAuthor(author); cisAnsBasedata.setAuthor(author);
cisAnsBasedata.setSourcesite("国家移民管理局"); cisAnsBasedata.setSourcesite("中国法院网");
cisAnsBasedata.setSourceaddress(sourceAddress); cisAnsBasedata.setSourceaddress(sourceAddress);
cisAnsBasedata.setType("HTML"); cisAnsBasedata.setType("HTML");
cisAnsBasedata.setPublishDate(publishDate.replace("T"," ")); cisAnsBasedata.setPublishDate(publishDate.replace("T"," "));
cisAnsBasedata.setCreateDate(publishDate.replace("T"," ")); cisAnsBasedata.setCreateDate(createDate.replace("T"," "));
cisAnsBasedata.setContent(contentWithTag); cisAnsBasedata.setContent(contentWithTag);
cisAnsBasedata.setContentNoTag(content); cisAnsBasedata.setContentNoTag(content);
cisAnsBasedata.setContentImgCvtTag(contentWithTag); cisAnsBasedata.setContentImgCvtTag(contentWithTag);
cisAnsBasedata.setLang("zh_CN"); cisAnsBasedata.setLang("zh_CN");
cisAnsBasedata.setOrigin(origin); cisAnsBasedata.setOrigin(origin);
cisAnsBasedata.setOrgId("2401"); cisAnsBasedata.setOrgId("3642");
cisAnsBasedata.setSourceType("News"); cisAnsBasedata.setSourceType("News");
cisAnsBasedata.setFromWhere("Python"); cisAnsBasedata.setFromWhere("Python");
cisAnsBasedataList.add(cisAnsBasedata); cisAnsBasedataList.add(cisAnsBasedata);
......
...@@ -338,7 +338,7 @@ ...@@ -338,7 +338,7 @@
<dependency> <dependency>
<groupId>com.alibaba</groupId> <groupId>com.alibaba</groupId>
<artifactId>druid</artifactId> <artifactId>druid</artifactId>
<version>1.0.5</version> <version>1.1.10</version>
</dependency> </dependency>
<dependency> <dependency>
<groupId>com.alibaba</groupId> <groupId>com.alibaba</groupId>
......
...@@ -24,23 +24,23 @@ public class CrawlerStaticApplication extends SpringBootServletInitializer imple ...@@ -24,23 +24,23 @@ public class CrawlerStaticApplication extends SpringBootServletInitializer imple
public void run(String... args) throws Exception { public void run(String... args) throws Exception {
System.out.println("——————++++++++++++——————==="); System.out.println("——————++++++++++++——————===");
String key="{\n" + // String key="{\n" +
" \"@class\": \"com.zzsn.clb.common.model.task.dto.titr.KeyWordsDTO\",\n" + // " \"@class\": \"com.zzsn.clb.common.model.task.dto.titr.KeyWordsDTO\",\n" +
" \"id\": \"1513415223147425794\",\n" + // " \"id\": \"1513415223147425794\",\n" +
" \"wordsCode\": \"KW-20220411-0003\",\n" + // " \"wordsCode\": \"KW-20220411-0003\",\n" +
" \"wordsName\": \"链长\",\n" + // " \"wordsName\": \"链长\",\n" +
" \"keyWord\": \"链长\",\n" + // " \"keyWord\": \"链长\",\n" +
" \"exclusionWord\": null,\n" + // " \"exclusionWord\": null,\n" +
" \"status\": \"1\",\n" + // " \"status\": \"1\",\n" +
" \"subjectId\": null,\n" + // " \"subjectId\": null,\n" +
" \"subjectIds\": null,\n" + // " \"subjectIds\": null,\n" +
" \"startTime\": \"1607443200000\",\n" + // " \"startTime\": \"1607443200000\",\n" +
" \"endTime\": null\n" + // " \"endTime\": null\n" +
"}"; // "}";
KeywordMsg keywordMsg = new Gson().fromJson(key, KeywordMsg.class); // KeywordMsg keywordMsg = new Gson().fromJson(key, KeywordMsg.class);
MetaSoSearchThread metaSearchThread=new MetaSoSearchThread(); // MetaSoSearchThread metaSearchThread=new MetaSoSearchThread();
metaSearchThread.keywordMsg=keywordMsg; // metaSearchThread.keywordMsg=keywordMsg;
metaSearchThread.crawler(); // metaSearchThread.crawler();
} }
} }
\ No newline at end of file
...@@ -29,5 +29,9 @@ public class KeywordMsg { ...@@ -29,5 +29,9 @@ public class KeywordMsg {
private Long startTime; private Long startTime;
private Long endTime; private Long endTime;
//需要启动的信息采集器
private List<String> searchEngines;
//采集的要求(1:标题 2:正文 3:全文)
private String crawlerType;
} }
...@@ -356,7 +356,7 @@ public class DetailSoSearchThread implements Runnable { ...@@ -356,7 +356,7 @@ public class DetailSoSearchThread implements Runnable {
clbAnsProcessitem.setOrigin(docInfo.getOrigin()); clbAnsProcessitem.setOrigin(docInfo.getOrigin());
clbAnsProcessitem.setPublishDate(docInfo.getPublishDate()); clbAnsProcessitem.setPublishDate(docInfo.getPublishDate());
clbAnsProcessitem.setSourceAddress(docInfo.getSourceaddress()); clbAnsProcessitem.setSourceAddress(docInfo.getSourceaddress());
clbAnsProcessitem.setSource("360搜索"); clbAnsProcessitem.setSource("6");
return clbAnsProcessitem; return clbAnsProcessitem;
} }
//转换qq新闻链接 //转换qq新闻链接
......
...@@ -337,7 +337,7 @@ ...@@ -337,7 +337,7 @@
<dependency> <dependency>
<groupId>com.alibaba</groupId> <groupId>com.alibaba</groupId>
<artifactId>druid</artifactId> <artifactId>druid</artifactId>
<version>1.0.5</version> <version>1.1.10</version>
</dependency> </dependency>
<dependency> <dependency>
<groupId>com.alibaba</groupId> <groupId>com.alibaba</groupId>
......
package com.zzsn; package com.zzsn;
import com.google.gson.Gson;
import com.zzsn.job.KafkaConsumerSougouTask;
import com.zzsn.search.MetaSouGouSearchThread;
import com.zzsn.search.entity.KeywordMsg;
import com.zzsn.utility.index.Constants;
import lombok.extern.slf4j.Slf4j;
import org.apache.kafka.clients.CommonClientConfigs;
import org.apache.kafka.clients.consumer.ConsumerConfig;
import org.apache.kafka.clients.consumer.ConsumerRecord;
import org.apache.kafka.clients.consumer.ConsumerRecords;
import org.apache.kafka.clients.consumer.KafkaConsumer;
import org.apache.kafka.common.TopicPartition;
import org.apache.kafka.common.serialization.StringDeserializer;
import org.springframework.boot.CommandLineRunner; import org.springframework.boot.CommandLineRunner;
import org.springframework.boot.SpringApplication; import org.springframework.boot.SpringApplication;
import org.springframework.boot.autoconfigure.SpringBootApplication; import org.springframework.boot.autoconfigure.SpringBootApplication;
...@@ -7,6 +20,11 @@ import org.springframework.boot.builder.SpringApplicationBuilder; ...@@ -7,6 +20,11 @@ import org.springframework.boot.builder.SpringApplicationBuilder;
import org.springframework.boot.web.servlet.ServletComponentScan; import org.springframework.boot.web.servlet.ServletComponentScan;
import org.springframework.boot.web.servlet.support.SpringBootServletInitializer; import org.springframework.boot.web.servlet.support.SpringBootServletInitializer;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Properties;
@Slf4j
@SpringBootApplication(scanBasePackages = "com.zzsn") @SpringBootApplication(scanBasePackages = "com.zzsn")
public class CrawlerStaticApplication extends SpringBootServletInitializer implements CommandLineRunner { public class CrawlerStaticApplication extends SpringBootServletInitializer implements CommandLineRunner {
@Override @Override
...@@ -38,6 +56,56 @@ public class CrawlerStaticApplication extends SpringBootServletInitializer imple ...@@ -38,6 +56,56 @@ public class CrawlerStaticApplication extends SpringBootServletInitializer imple
// MetaSouGouSearchThread metaSearchThread=new MetaSouGouSearchThread(); // MetaSouGouSearchThread metaSearchThread=new MetaSouGouSearchThread();
// metaSearchThread.keywordMsg=keywordMsg; // metaSearchThread.keywordMsg=keywordMsg;
// metaSearchThread.crawler(); // metaSearchThread.crawler();
consumerKeyword ();
}
/**
 * Polls the configured Kafka partitions for keyword messages and runs a Sogou meta-search
 * crawl for every message received.
 *
 * <p>Partitions are assigned manually from the comma-separated
 * {@code Constants.KAFKA_CONSUMER_PARTITION} setting. Offsets are committed right after each
 * poll, before the records are processed, so a record that fails is logged and skipped
 * rather than redelivered. The loop ends only when polling or committing throws; the
 * consumer is always closed on the way out.
 */
public void consumerKeyword (){
    log.info("定时获取mq消息");
    // Parse the partition list first so a malformed setting fails before a consumer is opened.
    ArrayList<TopicPartition> topicPartitions = new ArrayList<>();
    for (String partition : Constants.KAFKA_CONSUMER_PARTITION.split(",")) {
        topicPartitions.add(new TopicPartition(Constants.KAFKA_CONSUMER_TOPIC, Integer.parseInt(partition.trim())));
    }
    KafkaConsumer<String, String> consumer = createConsumer();
    try {
        consumer.assign(topicPartitions);
        Gson gson = new Gson();
        try {
            while(true){
                // NOTE(review): a zero timeout makes this loop spin when the topic is idle - consider a small wait
                ConsumerRecords<String, String> records = consumer.poll(0);
                consumer.commitSync();
                for(ConsumerRecord<String, String> record : records){
                    try {
                        KeywordMsg keywordMsg = gson.fromJson(record.value(), KeywordMsg.class);
                        MetaSouGouSearchThread metaSearchThread=new MetaSouGouSearchThread();
                        metaSearchThread.keywordMsg=keywordMsg;
                        metaSearchThread.crawler();
                    } catch (Exception e) {
                        // one bad message must not stop the whole consumer
                        log.error("Failed to process keyword message at offset {}", record.offset(), e);
                    }
                }
            }
        } catch (Exception e) {
            log.error("Kafka keyword consumer stopped", e);
        }
    } finally {
        // previously the consumer was never closed and an unused replacement was created instead
        consumer.close();
    }
}
private static KafkaConsumer<String, String> createConsumer() {
Properties properties = new Properties();
System.out.println(Constants.KAFKA_CONSUMER_SERVERS);
properties.put(CommonClientConfigs.BOOTSTRAP_SERVERS_CONFIG, Constants.KAFKA_CONSUMER_SERVERS);
properties.put(ConsumerConfig.KEY_DESERIALIZER_CLASS_CONFIG, StringDeserializer.class);
properties.put(ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG, StringDeserializer.class);
properties.put(ConsumerConfig.GROUP_ID_CONFIG, Constants.KAFKA_CONSUMER_GROUP_ID);
properties.put(ConsumerConfig.ENABLE_AUTO_COMMIT_CONFIG, "false");
//kafka数据的读取方式
properties.put(ConsumerConfig.AUTO_OFFSET_RESET_CONFIG,Constants.KAFKA_CONSUMER_AUTO_OFFSET_RESET);
// latest earliest
//时间间隔设置为1h
properties.put("max.poll.interval.ms", 60*60*1000);
properties.put(ConsumerConfig.MAX_POLL_RECORDS_CONFIG, 1);
return new KafkaConsumer<>(properties);
} }
} }
\ No newline at end of file
...@@ -50,7 +50,7 @@ public class KafkaConsumerSougouTask { ...@@ -50,7 +50,7 @@ public class KafkaConsumerSougouTask {
} }
//打包编译时修改定时启动的任务 如果是搜索关键词放开consumerKeyword 如果是内容解析放开consumerDetailUrl 上面的定时任务 //打包编译时修改定时启动的任务 如果是搜索关键词放开consumerKeyword 如果是内容解析放开consumerDetailUrl 上面的定时任务
@Scheduled(cron = "0 0/5 * * * ?") // @Scheduled(cron = "0 0/5 * * * ?")
@Async("webExecutor") @Async("webExecutor")
public void consumerKeyword (){ public void consumerKeyword (){
log.info("定时获取mq消息"); log.info("定时获取mq消息");
......
...@@ -29,5 +29,9 @@ public class KeywordMsg { ...@@ -29,5 +29,9 @@ public class KeywordMsg {
private Long startTime; private Long startTime;
private Long endTime; private Long endTime;
//需要启动的信息采集器
private List<String> searchEngines;
//采集的要求(1:标题 2:正文 3:全文)
private String crawlerType;
} }
...@@ -318,7 +318,7 @@ public class DetailSouGouSearchThread implements Runnable { ...@@ -318,7 +318,7 @@ public class DetailSouGouSearchThread implements Runnable {
clbAnsProcessitem.setOrigin(docInfo.getOrigin()); clbAnsProcessitem.setOrigin(docInfo.getOrigin());
clbAnsProcessitem.setPublishDate(docInfo.getPublishDate()); clbAnsProcessitem.setPublishDate(docInfo.getPublishDate());
clbAnsProcessitem.setSourceAddress(docInfo.getSourceaddress()); clbAnsProcessitem.setSourceAddress(docInfo.getSourceaddress());
clbAnsProcessitem.setSource("搜狗搜索"); clbAnsProcessitem.setSource("5");
return clbAnsProcessitem; return clbAnsProcessitem;
} }
//转换qq新闻链接 //转换qq新闻链接
......
...@@ -288,9 +288,9 @@ public class GetPostTest { ...@@ -288,9 +288,9 @@ public class GetPostTest {
// //String param = ""; // //String param = "";
// String sendRecvPost =GetPostTest.sendPost("http://flights.sichuanair.com/3uair/ibe/common/processSearchForm.do",param); // String sendRecvPost =GetPostTest.sendPost("http://flights.sichuanair.com/3uair/ibe/common/processSearchForm.do",param);
// System.out.println(sendRecvPost); // System.out.println(sendRecvPost);
String s = GetPostTest.sendGet("https://news.search.yahoo.com/search?p=camila+cabello&ei=UTF-8&b=11&pz=10&bct=0&xargs=0"); // String s = GetPostTest.sendGet("https://news.search.yahoo.com/search?p=camila+cabello&ei=UTF-8&b=11&pz=10&bct=0&xargs=0");
System.out.println("++++++++++++++"); // System.out.println("++++++++++++++");
System.out.println(s); // System.out.println(s);
// String s = GetPostTest.sendGet2("https://r.search.yahoo.com/cbclk2/dWU9Rjc1RjcxOTJGRTNDNDIxNCZ1dD0xNjUyNzU2NzU5NzgwJnVvPTcxNjc0NTA5NzMwMTM0Jmx0PTImcz0xJmVzPWlFQU4uX3dHUFMuMDhuTWZfRzF0UWs1c0NEcGRFX3JDOVJZRTVhYTk2a19NMWpRLQ--/RV=2/RE=1652785559/RO=10/RU=https%3a%2f%2fwww.bing.com%2faclick%3fld%3de8fQ3skeVyCmtdkXvJyer7lDVUCUyhmNmibXcU1g8Hhd8ZTP9yVbfAq5pNRMZ_6fHx8vQlszU4iyBnCrBPhJkkzdfcQjSK1zSIRvAuckKG1yLbotUiY-nz2v9wutjIGsfPzL1L_eSvC27hY1k4hD4Qxo_Rmg_G_1Q1KbJPdoZKtAY13AUSbO7IqFUetTg2hQpVKAISYw%26u%3daHR0cCUzYSUyZiUyZnd3dy5hdXRvbWV0YWxkaXJlY3QuY29tJTJmaW5kZXgucGhwJTNmbXNjbGtpZCUzZDJiNDZkMDcwOTQ1YzE2YjBiMmNkYjJmMzg1NGI5ZDkzJTI2dXRtX3NvdXJjZSUzZGJpbmclMjZ1dG1fbWVkaXVtJTNkY3BjJTI2dXRtX2NhbXBhaWduJTNkU0MlMjUyMC0lMjUyMFRyYWRlbWFyayUyNTIwQnJhbmRlZCUyNTIwLSUyNTIwRXhhY3QlMjZ1dG1fdGVybSUzZGh0dHAlMjUyMHd3dyUyNTIwJTI1MkJhbWQlMjUyMGNvbSUyNnV0bV9jb250ZW50JTNkQU1EJTI1MjBQYXJ0cw%26rlid%3d2b46d070945c16b0b2cdb2f3854b9d93/RK=2/RS=oav4rOCFRVacozG5vDOB2Ug99Ik-;_ylt=AwrXnCIXEYNig1EAyiTQtDMD;_ylu=Y29sbwNncTEEcG9zAzIEdnRpZAMEc2VjA292LXRvcA--?IG=0ad79c22e5134c6b9000000000e0df0b"); // String s = 
GetPostTest.sendGet2("https://r.search.yahoo.com/cbclk2/dWU9Rjc1RjcxOTJGRTNDNDIxNCZ1dD0xNjUyNzU2NzU5NzgwJnVvPTcxNjc0NTA5NzMwMTM0Jmx0PTImcz0xJmVzPWlFQU4uX3dHUFMuMDhuTWZfRzF0UWs1c0NEcGRFX3JDOVJZRTVhYTk2a19NMWpRLQ--/RV=2/RE=1652785559/RO=10/RU=https%3a%2f%2fwww.bing.com%2faclick%3fld%3de8fQ3skeVyCmtdkXvJyer7lDVUCUyhmNmibXcU1g8Hhd8ZTP9yVbfAq5pNRMZ_6fHx8vQlszU4iyBnCrBPhJkkzdfcQjSK1zSIRvAuckKG1yLbotUiY-nz2v9wutjIGsfPzL1L_eSvC27hY1k4hD4Qxo_Rmg_G_1Q1KbJPdoZKtAY13AUSbO7IqFUetTg2hQpVKAISYw%26u%3daHR0cCUzYSUyZiUyZnd3dy5hdXRvbWV0YWxkaXJlY3QuY29tJTJmaW5kZXgucGhwJTNmbXNjbGtpZCUzZDJiNDZkMDcwOTQ1YzE2YjBiMmNkYjJmMzg1NGI5ZDkzJTI2dXRtX3NvdXJjZSUzZGJpbmclMjZ1dG1fbWVkaXVtJTNkY3BjJTI2dXRtX2NhbXBhaWduJTNkU0MlMjUyMC0lMjUyMFRyYWRlbWFyayUyNTIwQnJhbmRlZCUyNTIwLSUyNTIwRXhhY3QlMjZ1dG1fdGVybSUzZGh0dHAlMjUyMHd3dyUyNTIwJTI1MkJhbWQlMjUyMGNvbSUyNnV0bV9jb250ZW50JTNkQU1EJTI1MjBQYXJ0cw%26rlid%3d2b46d070945c16b0b2cdb2f3854b9d93/RK=2/RS=oav4rOCFRVacozG5vDOB2Ug99Ik-;_ylt=AwrXnCIXEYNig1EAyiTQtDMD;_ylu=Y29sbwNncTEEcG9zAzIEdnRpZAMEc2VjA292LXRvcA--?IG=0ad79c22e5134c6b9000000000e0df0b");
// System.out.println("++++++++++++++"); // System.out.println("++++++++++++++");
// System.out.println(s); // System.out.println(s);
...@@ -303,8 +303,9 @@ public class GetPostTest { ...@@ -303,8 +303,9 @@ public class GetPostTest {
// } // }
// System.out.println(buffer.toString()); // System.out.println(buffer.toString());
// Map<String, String> cookie2 = getCookie2("https://www.sogou.com/sogou?interation=1728053249&query=intitle:Intel%20%E5%88%9B%E6%96%B0&tsn=0&page=2&ie=utf8&dp=1"); Map<String, String> cookie2 = getCookie2("https://weixin.sogou.com/weixin?type=1&s_from=input&query=%E8%A7%82%E7%82%B9%E4%B8%AD%E5%9B%BD&ie=utf8&_sug_=n&_sug_type_=");
// System.out.println(cookie2);
System.out.println(cookie2);
} }
} }
...@@ -21,7 +21,7 @@ KAFKA_CONSUMER_SERVERS=114.115.159.144:9092 ...@@ -21,7 +21,7 @@ KAFKA_CONSUMER_SERVERS=114.115.159.144:9092
#消费主题 #消费主题
KAFKA_CONSUMER_TOPIC=keyWordsInfo KAFKA_CONSUMER_TOPIC=keyWordsInfo
#消费者 #消费者
KAFKA_CONSUMER_GROUP_ID=so-sync KAFKA_CONSUMER_GROUP_ID=sougou-sync
#kafka消费信息模式 #kafka消费信息模式
KAFKA_CONSUMER_AUTO_OFFSET_RESET=earliest KAFKA_CONSUMER_AUTO_OFFSET_RESET=earliest
#信息发送主题 #信息发送主题
...@@ -35,8 +35,8 @@ KAFKA_PRODUCT_GOOGLE_URLLIST_TOPIC=sougou_crawler_urlList ...@@ -35,8 +35,8 @@ KAFKA_PRODUCT_GOOGLE_URLLIST_TOPIC=sougou_crawler_urlList
#百度信息链接发送主题 #百度信息链接发送主题
#KAFKA_PRODUCT_PASERURL_TOPIC=baidu_crawler_paserurl #KAFKA_PRODUCT_PASERURL_TOPIC=baidu_crawler_paserurl
#指定分区使用逗号分割 #指定分区使用逗号分割
#KAFKA_CONSUMER_PARTITION=0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15 KAFKA_CONSUMER_PARTITION=0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
KAFKA_CONSUMER_PARTITION=0 #KAFKA_CONSUMER_PARTITION=0
KAFKA_PRODUCT_PARTITION=0 KAFKA_PRODUCT_PARTITION=0
#搜索地址 #搜索地址
......
...@@ -29,5 +29,9 @@ public class KeywordMsg { ...@@ -29,5 +29,9 @@ public class KeywordMsg {
private Long startTime; private Long startTime;
private Long endTime; private Long endTime;
//需要启动的信息采集器
private List<String> searchEngines;
//采集的要求(1:标题 2:正文 3:全文)
private String crawlerType;
} }
...@@ -337,7 +337,7 @@ ...@@ -337,7 +337,7 @@
<dependency> <dependency>
<groupId>com.alibaba</groupId> <groupId>com.alibaba</groupId>
<artifactId>druid</artifactId> <artifactId>druid</artifactId>
<version>1.0.5</version> <version>1.1.10</version>
</dependency> </dependency>
<dependency> <dependency>
<groupId>com.alibaba</groupId> <groupId>com.alibaba</groupId>
......
package com.zzsn; package com.zzsn;
import com.google.gson.Gson;
import com.zzsn.search.MetaYahooSearchThread;
import com.zzsn.search.entity.KeywordMsg;
import com.zzsn.utility.index.Constants;
import lombok.extern.slf4j.Slf4j;
import org.apache.kafka.clients.CommonClientConfigs;
import org.apache.kafka.clients.consumer.ConsumerConfig;
import org.apache.kafka.clients.consumer.ConsumerRecord;
import org.apache.kafka.clients.consumer.ConsumerRecords;
import org.apache.kafka.clients.consumer.KafkaConsumer;
import org.apache.kafka.common.TopicPartition;
import org.apache.kafka.common.serialization.StringDeserializer;
import org.springframework.boot.CommandLineRunner;
import org.springframework.boot.SpringApplication; import org.springframework.boot.SpringApplication;
import org.springframework.boot.autoconfigure.SpringBootApplication; import org.springframework.boot.autoconfigure.SpringBootApplication;
import org.springframework.boot.builder.SpringApplicationBuilder; import org.springframework.boot.builder.SpringApplicationBuilder;
import org.springframework.boot.web.servlet.support.SpringBootServletInitializer; import org.springframework.boot.web.servlet.support.SpringBootServletInitializer;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Properties;
@Slf4j
@SpringBootApplication(scanBasePackages = "com.zzsn") @SpringBootApplication(scanBasePackages = "com.zzsn")
public class CrawlerStaticApplication extends SpringBootServletInitializer { public class CrawlerStaticApplication extends SpringBootServletInitializer implements CommandLineRunner {
@Override @Override
protected SpringApplicationBuilder configure(SpringApplicationBuilder builder) { protected SpringApplicationBuilder configure(SpringApplicationBuilder builder) {
return builder.sources(CrawlerStaticApplication.class); return builder.sources(CrawlerStaticApplication.class);
...@@ -14,5 +33,69 @@ public class CrawlerStaticApplication extends SpringBootServletInitializer { ...@@ -14,5 +33,69 @@ public class CrawlerStaticApplication extends SpringBootServletInitializer {
public static void main(String[] args) { public static void main(String[] args) {
SpringApplication.run(CrawlerStaticApplication.class, args); SpringApplication.run(CrawlerStaticApplication.class, args);
} }
/**
 * Start-up hook: hands control to {@link #consumerKeyword()}.
 *
 * <p>{@code consumerKeyword()} polls Kafka in an endless loop on the calling thread, so
 * this method only returns once that loop has ended.
 *
 * @param args command-line arguments; not used here
 * @throws Exception declared by the method signature; nothing in this body throws a checked exception
 */
public void run(String... args) throws Exception {
// Disabled manual smoke test: builds a KeywordMsg from a hard-coded JSON payload and
// runs a single crawl without going through Kafka.
// NOTE(review): the sample instantiates MetaSouGouSearchThread, while consumerKeyword()
// in this class uses MetaYahooSearchThread — confirm which is intended before re-enabling.
// System.out.println("——————++++++++++++——————===");
// String key="{\n" +
// " \"@class\": \"com.zzsn.clb.common.model.task.dto.titr.KeyWordsDTO\",\n" +
// " \"id\": \"1513415223147425794\",\n" +
// " \"wordsCode\": \"KW-20220411-0003\",\n" +
// " \"wordsName\": \"链长\",\n" +
// " \"keyWord\": \"链长\",\n" +
// " \"exclusionWord\": null,\n" +
// " \"status\": \"1\",\n" +
// " \"subjectId\": null,\n" +
// " \"subjectIds\": null,\n" +
// " \"startTime\": \"1607443200000\",\n" +
// " \"endTime\": null\n" +
// "}";
// KeywordMsg keywordMsg = new Gson().fromJson(key, KeywordMsg.class);
// MetaSouGouSearchThread metaSearchThread=new MetaSouGouSearchThread();
// metaSearchThread.keywordMsg=keywordMsg;
// metaSearchThread.crawler();
consumerKeyword ();
}
/**
 * Consumes keyword messages from Kafka and runs a Yahoo meta-search crawl for each one.
 *
 * <p>The consumer subscribes to {@code Constants.KAFKA_CONSUMER_TOPIC} and is polled in an
 * endless loop on the calling thread. Offsets are committed right after each poll and
 * before the records are processed, so a record whose crawl fails is not delivered again.
 *
 * <p>The method only returns when polling, committing, parsing or crawling throws; the
 * failure is logged and the consumer is closed.
 */
public void consumerKeyword (){
    log.info("定时获取mq消息");

    // Wait up to one second per poll instead of spinning with a zero timeout.
    final long pollTimeoutMs = 1000L;
    Gson gson = new Gson();
    KafkaConsumer<String, String> consumer = createConsumer();
    try {
        consumer.subscribe(Arrays.asList(Constants.KAFKA_CONSUMER_TOPIC));
        while (true) {
            ConsumerRecords<String, String> records = consumer.poll(pollTimeoutMs);
            if (records.isEmpty()) {
                // Nothing new was fetched, so there is no new position to commit.
                continue;
            }
            // Commit before processing: a crawl can run for a long time
            // (max.poll.interval.ms is raised to one hour in createConsumer()).
            consumer.commitSync();
            for (ConsumerRecord<String, String> record : records) {
                KeywordMsg keywordMsg = gson.fromJson(record.value(), KeywordMsg.class);
                MetaYahooSearchThread metaSearchThread = new MetaYahooSearchThread();
                metaSearchThread.keywordMsg = keywordMsg;
                metaSearchThread.crawler();
            }
        }
    } catch (Exception e) {
        // Report the failure instead of silently discarding it.
        log.error("Kafka keyword consumer stopped because of an error", e);
    } finally {
        // Leave the consumer group cleanly and release the consumer's resources.
        consumer.close();
    }
}
/**
 * Builds a Kafka consumer with String keys and values from the settings in {@code Constants}.
 *
 * <p>Auto-commit is switched off, at most one record is returned per poll, and the
 * maximum interval allowed between two polls is set to one hour. The configured
 * bootstrap servers are echoed to standard output.
 *
 * @return a new consumer that is neither subscribed nor assigned yet
 */
private static KafkaConsumer<String, String> createConsumer() {
    System.out.println(Constants.KAFKA_CONSUMER_SERVERS);

    // One hour, in milliseconds.
    final int maxPollIntervalMs = 60 * 60 * 1000;

    Properties props = new Properties();
    // Connection and consumer-group identity.
    props.put(CommonClientConfigs.BOOTSTRAP_SERVERS_CONFIG, Constants.KAFKA_CONSUMER_SERVERS);
    props.put(ConsumerConfig.GROUP_ID_CONFIG, Constants.KAFKA_CONSUMER_GROUP_ID);
    // Keys and values are plain strings.
    props.put(ConsumerConfig.KEY_DESERIALIZER_CLASS_CONFIG, StringDeserializer.class);
    props.put(ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG, StringDeserializer.class);
    // Offsets are committed by the caller; the reset policy (latest / earliest) is configurable.
    props.put(ConsumerConfig.ENABLE_AUTO_COMMIT_CONFIG, "false");
    props.put(ConsumerConfig.AUTO_OFFSET_RESET_CONFIG, Constants.KAFKA_CONSUMER_AUTO_OFFSET_RESET);
    // Hand out one record per poll and allow up to an hour between polls.
    props.put(ConsumerConfig.MAX_POLL_RECORDS_CONFIG, 1);
    props.put("max.poll.interval.ms", maxPollIntervalMs);

    return new KafkaConsumer<>(props);
}
} }
\ No newline at end of file
...@@ -50,7 +50,7 @@ public class KafkaConsumerYahooTask { ...@@ -50,7 +50,7 @@ public class KafkaConsumerYahooTask {
} }
//打包编译时修改定时启动的任务 如果是搜索关键词放开consumerKeyword 如果是内容解析放开consumerDetailUrl 上面的定时任务 //打包编译时修改定时启动的任务 如果是搜索关键词放开consumerKeyword 如果是内容解析放开consumerDetailUrl 上面的定时任务
@Scheduled(cron = "0 0/2 * * * ?") // @Scheduled(cron = "0 0/2 * * * ?")
@Async("webExecutor") @Async("webExecutor")
public void consumerKeyword (){ public void consumerKeyword (){
log.info("定时获取mq消息"); log.info("定时获取mq消息");
......
...@@ -19,7 +19,7 @@ import java.util.Map; ...@@ -19,7 +19,7 @@ import java.util.Map;
* *
*/ */
@Configuration @Configuration
@EnableKafka //@EnableKafka
public class KafkaProducerConfig { public class KafkaProducerConfig {
@Value("${kafka.producer.servers}") @Value("${kafka.producer.servers}")
......
...@@ -420,6 +420,7 @@ public class MetaYahooSearchThread implements Runnable { ...@@ -420,6 +420,7 @@ public class MetaYahooSearchThread implements Runnable {
clbAnsProcessitem.setOrigin(docInfo.getOrigin()); clbAnsProcessitem.setOrigin(docInfo.getOrigin());
clbAnsProcessitem.setPublishDate(docInfo.getPublishDate()); clbAnsProcessitem.setPublishDate(docInfo.getPublishDate());
clbAnsProcessitem.setSourceAddress(docInfo.getSourceaddress()); clbAnsProcessitem.setSourceAddress(docInfo.getSourceaddress());
clbAnsProcessitem.setSource("8");
return clbAnsProcessitem; return clbAnsProcessitem;
} }
......
...@@ -29,5 +29,9 @@ public class KeywordMsg { ...@@ -29,5 +29,9 @@ public class KeywordMsg {
private Long startTime; private Long startTime;
private Long endTime; private Long endTime;
//需要启动的信息采集器
private List<String> searchEngines;
//采集的要求(1:标题 2:正文 3:全文)
private String crawlerType;
} }
...@@ -362,7 +362,7 @@ public class DetailYahooSearchThread implements Runnable { ...@@ -362,7 +362,7 @@ public class DetailYahooSearchThread implements Runnable {
clbAnsProcessitem.setOrigin(docInfo.getOrigin()); clbAnsProcessitem.setOrigin(docInfo.getOrigin());
clbAnsProcessitem.setPublishDate(docInfo.getPublishDate()); clbAnsProcessitem.setPublishDate(docInfo.getPublishDate());
clbAnsProcessitem.setSourceAddress(docInfo.getSourceaddress()); clbAnsProcessitem.setSourceAddress(docInfo.getSourceaddress());
clbAnsProcessitem.setSource("google搜索"); clbAnsProcessitem.setSource("8");
return clbAnsProcessitem; return clbAnsProcessitem;
} }
//转换qq新闻链接 //转换qq新闻链接
......
...@@ -21,7 +21,7 @@ KAFKA_CONSUMER_SERVERS=114.115.159.144:9092 ...@@ -21,7 +21,7 @@ KAFKA_CONSUMER_SERVERS=114.115.159.144:9092
#消费主题 #消费主题
KAFKA_CONSUMER_TOPIC=keyWordsInfo KAFKA_CONSUMER_TOPIC=keyWordsInfo
#消费者 #消费者
KAFKA_CONSUMER_GROUP_ID=google-sync KAFKA_CONSUMER_GROUP_ID=yahoo-sync
#kafka消费信息模式 #kafka消费信息模式
KAFKA_CONSUMER_AUTO_OFFSET_RESET=earliest KAFKA_CONSUMER_AUTO_OFFSET_RESET=earliest
#信息发送主题 #信息发送主题
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论