提交 afb6cfd1 作者: 薛凌堃

企业公告脚本测试

上级 3e7c3b28
import json import json
...@@ -120,7 +120,7 @@ def tableUpdate(retData, com_name, year, pdf_name, num): ...@@ -120,7 +120,7 @@ def tableUpdate(retData, com_name, year, pdf_name, num):
values = ( values = (
year, pdf_name, type_id, item_id, group_name, path, full_path, category, file_size, order_by, year, pdf_name, type_id, item_id, group_name, path, full_path, category, file_size, order_by,
status, create_by, status, create_by,
create_time, page_size,path,'zzsn') create_time, page_size,full_path.split('https://zzsn.obs.cn-north-1.myhuaweicloud.com/')[1],'zzsn')
cursor_.execute(Upsql, values) # 插入 cursor_.execute(Upsql, values) # 插入
cnx_.commit() # 提交 cnx_.commit() # 提交
except Exception as e: except Exception as e:
...@@ -283,14 +283,14 @@ def GetContent(pdf_url, pdf_name, social_code, year, pub_time, start_time,com_na ...@@ -283,14 +283,14 @@ def GetContent(pdf_url, pdf_name, social_code, year, pub_time, start_time,com_na
#判断文件是否已经存在obs服务器中 #判断文件是否已经存在obs服务器中
# file_path = 'QYNotice//浙江国祥股份有限公司首次公开发行股票并在主板上市暂缓发行公告' # file_path = 'QYNotice//浙江国祥股份有限公司首次公开发行股票并在主板上市暂缓发行公告'
now_time = time.strftime("%Y-%m") now_time = time.strftime("%Y-%m")
file_path = 'QYNotice/'+pdf_name # file_path = 'QYNotice/'+pdf_name
response = obsClient.getObjectMetadata('zzsn', file_path) # response = obsClient.getObjectMetadata('zzsn', file_path)
if response.status >= 300: # if response.status >= 300:
log.info('=====文件不存在obs=====') # log.info('=====文件不存在obs=====')
pass # pass
else: # else:
log.info(f'=====文件存在obs========{file_path}') # log.info(f'=====文件存在obs========{file_path}')
return False # return False
#上传至华为云服务器 #上传至华为云服务器
retData = uptoOBS(pdf_url,pdf_name,8,social_code) retData = uptoOBS(pdf_url,pdf_name,8,social_code)
#附件插入att数据库 #附件插入att数据库
...@@ -323,7 +323,7 @@ def GetContent(pdf_url, pdf_name, social_code, year, pub_time, start_time,com_na ...@@ -323,7 +323,7 @@ def GetContent(pdf_url, pdf_name, social_code, year, pub_time, start_time,com_na
'sid': '1684032033495392257', 'sid': '1684032033495392257',
'sourceAddress': pdf_url, # 原文链接 'sourceAddress': pdf_url, # 原文链接
'summary': '', 'summary': '',
'title': pdf_name, 'title': pdf_name.replace('.pdf',''),
'type': 3, 'type': 3,
'socialCreditCode': social_code, 'socialCreditCode': social_code,
'year': year 'year': year
...@@ -332,7 +332,7 @@ def GetContent(pdf_url, pdf_name, social_code, year, pub_time, start_time,com_na ...@@ -332,7 +332,7 @@ def GetContent(pdf_url, pdf_name, social_code, year, pub_time, start_time,com_na
# 将相应字段通过kafka传输保存 # 将相应字段通过kafka传输保存
try: try:
producer = KafkaProducer(bootstrap_servers=['114.115.159.144:9092']) producer = KafkaProducer(bootstrap_servers=['114.115.159.144:9092'])
kafka_result = producer.send("researchReportTopic", json.dumps(dic_news, ensure_ascii=False).encode('utf8')) kafka_result = producer.send("researchReportTopicaaaas", json.dumps(dic_news, ensure_ascii=False).encode('utf8'))
print(kafka_result.get(timeout=10)) print(kafka_result.get(timeout=10))
...@@ -430,6 +430,7 @@ def SpiderByZJH(url, payload, dic_info, start_time,num): # dic_info 数据库 ...@@ -430,6 +430,7 @@ def SpiderByZJH(url, payload, dic_info, start_time,num): # dic_info 数据库
# 判断数据库中是否有该条资讯 # 判断数据库中是否有该条资讯
ifexist = ifInstert(short_name, social_code, pdf_url) ifexist = ifInstert(short_name, social_code, pdf_url)
#如果不存在 ifexist = True #如果不存在 ifexist = True
# ifexist = True
if ifexist: if ifexist:
# 解析PDF内容,先获取PDF链接 下载 解析成功,解析失败 ,传输成功,传输失败 # 解析PDF内容,先获取PDF链接 下载 解析成功,解析失败 ,传输成功,传输失败
result = GetContent(pdf_url, name_pdf, social_code, year, pub_time, start_time,com_name,num) result = GetContent(pdf_url, name_pdf, social_code, year, pub_time, start_time,com_name,num)
......
Markdown 格式
0%
您添加了 0 人到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 登录 后发表评论