提交 cdc4a715 作者: 薛凌堃

独角兽榜单基本信息

上级 41c6aaa2
......@@ -9,19 +9,14 @@ import json
from kafka import KafkaProducer
from base.BaseCore import BaseCore
from getQccId import find_id_by_name
from base.BaseCore import BaseCore
baseCore = BaseCore()
cnx_ = baseCore.cnx
cursor_ = baseCore.cursor
baseCore = BaseCore()
cnx = baseCore.cnx
cursor = baseCore.cursor
log = baseCore.getLogger()
# 通过企查查id获取企业基本信息
def info_by_id(com_id,com_name):
def info_by_id(com_id,com_name,social_code):
aa_dict_list = []
t = str(int(time.time()) * 1000)
......@@ -29,14 +24,17 @@ def info_by_id(com_id,com_name):
url = "https://xcx.qcc.com/mp-weixin/forwardApp/v1/ent/detail?token={}&t={}&unique={}".format(token, t, com_id)
resp_dict = requests.get(url=url, headers=headers, verify=False).json()
log.info(resp_dict)
time.sleep(2)
com_jc_name = ''
try:
result_dict = resp_dict['result']['Company']
except:
print(com_name + ":获取失败")
#
log.info(com_name + ":获取失败===========重新放入redis")
baseCore.rePutIntoR('dujs_1020:baseinfo_socialcode',social_code)
return aa_dict_list
company_name = result_dict['Name']
CreditCode = result_dict['CreditCode']
if CreditCode is None:
......@@ -309,11 +307,12 @@ def info_by_id(com_id,com_name):
}
aa_dict_list.append(aa_dict)
print(company_name + ":爬取完成")
log.info(company_name + ":爬取完成")
return aa_dict_list
if __name__ == '__main__':
taskType = '基本信息/企查查/福布斯'
taskType = '基本信息/企查查/单项双百企业冠军'
headers = {
'Host': 'xcx.qcc.com',
'Connection': 'keep-alive',
......@@ -325,54 +324,73 @@ if __name__ == '__main__':
'Referer': 'https://servicewechat.com/wx395200814fcd7599/166/page-frame.html',
'Accept-Encoding': 'gzip, deflate, br,'
}
list_weicha = []
name_list = []
#从redis里拿数据
while True:
# TODO:需要隔两个小时左右抓包修改,token从数据库中获得
token = baseCore.GetToken()
list_weicha = []
list_all_info = []
name_list = []
dataList = []
if token:
pass
else:
log.info('==========已无token==========')
time.sleep(30)
continue
# list_all_info = []
start_time = time.time()
# 获取企业信息
social_code = baseCore.redicPullData('BaseInfoEnterpriseFbs:gnqy_social_code')
# social_code = '91110000710924945A'
if social_code is None:
# social_code = baseCore.redicPullData('dujs_1020:baseinfo_socialcode')
social_code = '91310115067758342E'
if social_code == '' or social_code is None:
time.sleep(20)
continue
log.info(f'----当前企业{social_code}-----')
dic_info = baseCore.getInfomation(social_code)
#
count = dic_info[13]
log.info(f'----当前企业{social_code}--开始处理---')
count = dic_info[14]
com_name = dic_info[1]
social_code = dic_info[2]
# 企查查id
company_id = dic_info[12]
# 如果没有信用代码 就通过名字搜索 如果有信用代码 就通过信用代码
#如果没有信用代码 就通过名字搜索 如果有信用代码 就通过信用代码
if company_id == None:
if social_code:
company_id = find_id_by_name(start_time, token, social_code)
company_id = find_id_by_name(start_time,token,social_code)
else:
company_id = find_id_by_name(start_time, token, com_name)
# todo:写入数据库
updateSql = f"update EnterpriseInfo set QCCID = '{company_id}' where SocialCode = '{social_code}'"
cursor_.execute(updateSql)
cnx_.commit()
post_data_list = info_by_id(company_id, com_name)
if company_id == "":
print(com_name + ":企业ID获取失败")
list_weicha.append(com_name + ":企业ID获取失败")
company_id = find_id_by_name(start_time,token,com_name)
if company_id == 'null':
log.info('=====搜索不到该企业====')
#todo:搜不到的企业没有信用代码 传输不过去 生成一个信用代码
baseCore.rePutIntoR('dujs_1020:baseinfo_socialcode', social_code + ':搜索不到')
continue
else:
log.info(f'====={social_code}===={company_id}=====获取企业id成功=====')
try:
post_data_list = info_by_id(company_id, com_name)
except:
log.info(f'====={social_code}=====获取基本信息失败,重新放入redis=====')
baseCore.rePutIntoR('BaseInfoEnterpriseFbs:gnqy_social_code', social_code)
if not company_id:
log.info(social_code + ":企业ID获取失败===重新放入redis")
list_weicha.append(social_code + ":企业ID获取失败")
baseCore.rePutIntoR('dujs_1020:baseinfo_socialcode',social_code)
baseCore.delete_token(token)
log.info('=====已重新放入redis,失效token已删除======')
time.sleep(20)
continue
else:
log.info(f'====={com_name}===={company_id}=====获取企业id成功=====')
# todo:写入数据库
updateSql = f"update EnterpriseInfo set QCCID = '{company_id}' where SocialCode = '{social_code}'"
cursor_.execute(updateSql)
cnx_.commit()
try:
post_data_list = info_by_id(company_id, com_name, social_code)
except:
log.info(f'====={social_code}=====获取基本信息失败,重新放入redis=====')
baseCore.rePutIntoR('dujs_1020:baseinfo_socialcode', social_code)
continue
if post_data_list:
pass
else:
log.info(f'======{social_code}====企查查token失效====')
time.sleep(20)
continue
for post_data in post_data_list:
list_all_info.append(post_data)
if post_data is None:
print(com_name + ":企业信息获取失败")
list_weicha.append(com_name + ":企业信息获取失败")
......@@ -396,17 +414,18 @@ if __name__ == '__main__':
takeTime = baseCore.getTimeCost(start_time, time.time())
baseCore.recordLog(get_socialcode, taskType, state, takeTime, '', exception)
log.info(f"{get_name}--{get_socialcode}--kafka传输失败")
# break
# 信息采集完成后将该企业的采集次数更新
runType = 'BaseInfoRunCount'
count += 1
baseCore.updateRun(social_code, runType, count)
nowtime = baseCore.getNowTime(1).replace('-', '_')[:10]
break
nowtime = baseCore.getNowTime(1).replace('-','_')[:10]
companyName = pd.DataFrame(name_list)
companyName.to_excel(f'./data/企业名称对比_{nowtime}.xlsx', index=False)
companyName.to_excel(f'./data/企业名称对比_{nowtime}.xlsx',index=False)
false_com = pd.DataFrame(list_weicha)
false_com.to_excel(f'./data/采集失败企业名单_{nowtime}.xlsx', index=False)
baseCore.close()
false_com.to_excel(f'./data/采集失败企业名单_{nowtime}.xlsx',index=False)
......
Markdown 格式
0%
您添加了 0 人到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 登录 后发表评论