Commit eff87695 by 薛凌堃

Maintenance of the 企查查 (Qichacha) basic-information crawler: page fetches move from direct requests calls to a single shared Selenium driver session, and the cookie-token bookkeeping (token.getToken / token.updateTokeen) is switched on.

Parent 9b2d7df4
@@ -76,7 +76,13 @@ def baseinfo(com_soup):
             value = cominfo.find('span', class_='val').text.replace('复制', '').strip(' ')
         except:
             try:
-                value = cominfo.find('span', class_='val next-tick-copy-value').text.replace('复制', '').strip(' ')
+                value_tags = cominfo.find_all('span')
+                for _ in value_tags:
+                    if len(_.attrs) == 0:
+                        value = _.text.replace('复制', '').strip(' ')
+                        break
+                else:
+                    return data
             except:
                 return data
     pattern = r'\(\d{4}\s*年\)'
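
The new lookup leans on Python's for/else: the else branch runs only when the loop finishes without a break, i.e. when no attribute-less <span> was found, in which case the function bails out with the data collected so far. A minimal self-contained sketch of the same extraction, using a made-up HTML snippet:

    from bs4 import BeautifulSoup

    # Hypothetical row of the company-info table; the real node comes from com_soup.
    html = '<div><span class="val">label</span><span>91110108558521630L 复制</span></div>'
    cominfo = BeautifulSoup(html, 'html.parser').find('div')

    for tag in cominfo.find_all('span'):
        if len(tag.attrs) == 0:              # the bare <span> carries the value
            value = tag.text.replace('复制', '').strip(' ')
            break
    else:                                    # no break: no bare <span> was found
        value = None                         # the real code returns data here
    print(value)                             # 91110108558521630L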
@@ -97,20 +103,20 @@ def baseinfo(com_soup):
     return data
 # 检查登陆状态
-def checklogin(key):
-
-    # url = f'https://www.qcc.com/web/search?key=91110108558521630L'
-    url = f'https://www.qcc.com/web/search?key={key}'
-    # ip = baseCore.get_proxy()
-    # req = requests.get(headers=headers, url=url, proxies=ip)
-    req = requests.get(headers=headers, url=url)
-    time.sleep(1)
-    soup = BeautifulSoup(req.content, 'html.parser')
-    if soup.find('title').text == '会员登录 - 企查查':
-        log.info('状态---未登录')
-        soup = ''
-        return soup
-    return soup
+# def checklogin(key):
+#
+#     # url = f'https://www.qcc.com/web/search?key=91110108558521630L'
+#     url = f'https://www.qcc.com/web/search?key={key}'
+#     # ip = baseCore.get_proxy()
+#     # req = requests.get(headers=headers, url=url, proxies=ip)
+#     req = requests.get(headers=headers, url=url)
+#     time.sleep(1)
+#     soup = BeautifulSoup(req.content, 'html.parser')
+#     if soup.find('title').text == '会员登录 - 企查查':
+#         log.info('状态---未登录')
+#         soup = ''
+#         return soup
+#     return soup
 # 处理要发送的字段
 def dic_handle(result_dic):
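
checklogin is retired because it fetched the search page with plain requests, while the rest of this commit routes every page load through the shared authenticated Selenium driver. For comparison only, a hedged sketch of the same title-based login test done through that driver (this helper is not part of the commit):

    from bs4 import BeautifulSoup

    def checklogin_selenium(driver, key):
        # Same detection idea as the retired function: the login wall
        # titles the page '会员登录 - 企查查'.
        driver.get(f'https://www.qcc.com/web/search?key={key}')
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        title = soup.find('title')
        if title and title.text == '会员登录 - 企查查':
            return ''      # callers treat a falsy soup as "not logged in"
        return soup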
@@ -333,20 +339,21 @@ def redaytowork(com_name,social_code,securitiesCode, securitiesShortName, listin
     # company_id = dic_info[12]
     # 如果没有信用代码 就通过名字搜索 如果有信用代码 就通过信用代码
     if social_code:
-        # soup = checklogin(social_code)
         url = f'https://www.qcc.com/web/search?key={social_code}'
         driver.get(url)
         page_source = driver.page_source
         soup = BeautifulSoup(page_source, 'html.parser')
     else:
-        soup = ''
-        # soup = checklogin(com_name)
+        url = f'https://www.qcc.com/web/search?key={com_name}'
+        driver.get(url)
+        page_source = driver.page_source
+        soup = BeautifulSoup(page_source, 'html.parser')
         pass
     if not soup:
         log.info("登录失效===重新放入redis")
         baseCore.r.lpush('BaseInfoEnterprise:gnqy_socialCode', company_field)
         # token.delete_token(id_cookie)
-        log.info('=====已重新放入redis,失效cookies已删除======')
+        # log.info('=====已重新放入redis,失效cookies已删除======')
         time.sleep(20)
         return count
     else:
@@ -355,7 +362,7 @@ def redaytowork(com_name,social_code,securitiesCode, securitiesShortName, listin
         except:
             log.info("登录失效===重新放入redis")
             baseCore.r.lpush('BaseInfoEnterprise:gnqy_socialCode', company_field)
-            # token.updateTokeen(id_cookie,2)
+            token.updateTokeen(id_cookie,2)
             log.info('=====已重新放入redis,cookies已封号======')
             time.sleep(20)
             return count
@@ -371,22 +378,25 @@ def redaytowork(com_name,social_code,securitiesCode, securitiesShortName, listin
             if spiderwork(soup, com_name, securitiesCode, securitiesShortName, listingDate, category, exchange, listType, ynDomestic, countryName, file_name):
                 count += 1
                 log.info(f'采集{com_name}成功=======耗时{baseCore.getTimeCost(start_time, time.time())}')
-                # token.updateTokeen(id_cookie,3)
+                token.updateTokeen(id_cookie,3)
                 return count
             else:
                 return count
         except Exception as e:
             log.info(f'====={social_code}=====获取基本信息失败,重新放入redis=====')
             baseCore.r.lpush('BaseInfoEnterprise:gnqy_socialCode', company_field)
-            # token.updateTokeen(id_cookie,2)
-            # log.info('=====已重新放入redis,cookies已封号======')
+            token.updateTokeen(id_cookie,2)
+            log.info('=====已重新放入redis,cookies已封号======')
             return count
 def ifbeforename(company_url):
-    req_ = requests.get(headers=headers, url=company_url)
-    com_soup = BeautifulSoup(req_.content, 'html.parser')
+    # req_ = requests.get(headers=headers, url=company_url)
+    # com_soup = BeautifulSoup(req_.content, 'html.parser')
+    driver.get(company_url)
+    page_source_2 = driver.page_source
+    com_soup = BeautifulSoup(page_source_2, 'html.parser')
     try:
         businessinfo = com_soup.find('div', class_='cominfo-normal')
     except:
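
The re-enabled token.updateTokeen calls, read together with the log messages, suggest that status 2 flags a banned cookie and status 3 records a successful use. The token module itself is not in this diff, so the pool below is purely an assumption reconstructed from those call sites:

    import json
    import redis

    class TokenPool:
        # Hypothetical stand-in for the `token` module; every name and
        # status code here is inferred from how the diff above calls it.
        def __init__(self, r, key='qcc:cookies'):
            self.r, self.key = r, key

        def getToken(self):
            # login() indexes the result as cookieinfo[0] (an id) and
            # cookieinfo[1] (a JSON string of cookie values).
            raw = self.r.lpop(self.key)
            return json.loads(raw) if raw else None

        def updateTokeen(self, cookie_id, status):
            # Assumed semantics: 2 = cookie banned, 3 = used successfully.
            self.r.hset(f'{self.key}:status', cookie_id, status)

    # e.g. pool = TokenPool(redis.Redis()); cookieinfo = pool.getToken()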
@@ -409,8 +419,6 @@ def spiderinfo(company_url, securitiesCode, securitiesShortName, listingDate, ca
     updateSql = f"update EnterpriseInfo set QCCID = '{qccid}' where SocialCode = '{social_code}'"
     cursor_.execute(updateSql)
     cnx_.commit()
-    # ip = baseCore.get_proxy()
-    # req_ = requests.get(headers=headers, url=company_url, proxies=ip)
     # req_ = requests.get(headers=headers, url=company_url)
     # com_soup = BeautifulSoup(req_.content, 'html.parser')
@@ -571,17 +579,17 @@ def spiderwork(soup, receptname, securitiesCode, securitiesShortName, listingDat
 def login():
     driver = create_driver()
-    url = 'https://www.qcc.com/'
+    url = 'https://www.qcc.com'
     driver.get(url)
     driver.maximize_window()
-    from selenium.webdriver.support import expected_conditions as EC
-    wait = WebDriverWait(driver, 10)
-    wait.until(EC.presence_of_element_located((By.CLASS_NAME, "nav-item")))
-    # page_source = browser.page_source
-    # soup = BeautifulSoup(page_source,'html.parser')
-    # print(soup)
-    driver.find_element(By.CLASS_NAME, 'nav-item').click()
-    time.sleep(10)
+    # from selenium.webdriver.support import expected_conditions as EC
+    # wait = WebDriverWait(driver, 10)
+    # wait.until(EC.presence_of_element_located((By.CLASS_NAME, "nav-item")))
+    # # page_source = browser.page_source
+    # # soup = BeautifulSoup(page_source,'html.parser')
+    # # print(soup)
+    # driver.find_element(By.CLASS_NAME, 'nav-item').click()
+    # time.sleep(10)
     # wait = WebDriverWait(driver, 10)
     # wait.until(EC.presence_of_element_located((By.CLASS_NAME, "login-change")))
     # driver.find_element(By.CLASS_NAME, 'login-change').click()
@@ -590,43 +598,53 @@ def login():
     # driver.find_element(By.XPATH, '//*[@id="loginModal"]/div/div/div/div[1]/div[3]/form/div[2]/input').send_keys('angel2468')
     # driver.find_element(By.XPATH, '//*[@id="loginModal"]/div/div/div/div[1]/div[3]/form/div[4]/button').click()
     # time.sleep(3)
-    cookie_list = driver.get_cookies()
-    # cookie_list= [{'domain': 'www.qcc.com', 'expiry': 1721790462, 'httpOnly': False, 'name': 'CNZZDATA1254842228', 'path': '/', 'secure': False, 'value': '1642640529-1706065651-%7C1706065663'}, {'domain': '.qcc.com', 'expiry': 1792465649, 'httpOnly': False, 'name': 'qcc_did', 'path': '/', 'sameSite': 'None', 'secure': True, 'value': 'a56c994f-851b-4d6f-964f-80896160c221'}, {'domain': '.qcc.com', 'expiry': 1706670461.146448, 'httpOnly': True, 'name': 'QCCSESSID', 'path': '/', 'secure': False, 'value': '15fbea36e490d86bda4ba24353'}, {'domain': '.qcc.com', 'expiry': 1721790450, 'httpOnly': False, 'name': 'UM_distinctid', 'path': '/', 'secure': False, 'value': '18d396fe41533d-04b6782077b01c-313f68-e1000-18d396fe416778'}, {'domain': 'www.qcc.com', 'expiry': 1706067447.840599, 'httpOnly': True, 'name': 'acw_tc', 'path': '/', 'secure': False, 'value': '3d365a3017060656472424474e1ed648e1b2a8b72216b66d27de7566e1'}]
+    # cookie_list = driver.get_cookies()
+    cookieinfo = token.getToken()
+    if cookieinfo:
+        pass
+    else:
+        log.info('==========已无cookies==========')
+        time.sleep(30)
+        return
+    id_cookie = cookieinfo[0]
+    cookie_ = json.loads(cookieinfo[1])
+    cookie_list= [{'domain': 'www.qcc.com', 'expiry': 1721815475, 'httpOnly': False, 'name': 'CNZZDATA1254842228', 'path': '/', 'sameSite': 'Lax', 'secure': False, 'value': f'{cookie_["CNZZDATA1254842228"]}'}, {'domain': '.qcc.com', 'expiry': 1740650660, 'httpOnly': False, 'name': 'qcc_did', 'path': '/', 'sameSite': 'None', 'secure': True, 'value': 'bb480035-2a34-4270-9a8b-db8b7d9374b3'}, {'domain': '.qcc.com', 'expiry': 1706695474, 'httpOnly': True, 'name': 'QCCSESSID', 'path': '/', 'sameSite': 'Lax', 'secure': False, 'value': 'ccf17b97219476a1faa8aaff79'}, {'domain': '.qcc.com', 'expiry': 1721815461, 'httpOnly': False, 'name': 'UM_distinctid', 'path': '/', 'sameSite': 'Lax', 'secure': False, 'value': '18d3aed87f3552-01ba17134bcbe9-4c657b58-e1000-18d3aed87f4c5d'}, {'domain': 'www.qcc.com', 'expiry': 1706092459, 'httpOnly': True, 'name': 'acw_tc', 'path': '/', 'sameSite': 'Lax', 'secure': False, 'value': '3d365a1c17060906591851865e848bfd116d30ed8d2ac3e144455c8ff8'}]
     for cookie in cookie_list:
         driver.add_cookie(cookie)
-    return driver
+    time.sleep(5)
+    url_test = 'https://www.qcc.com/firm/a5f5bb3776867b3e273cd034d6fb4baa.html'
+    driver.get(url_test)
+    return driver,id_cookie
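
Selenium only accepts add_cookie for the domain of the page currently loaded, which explains the order of operations in the new login(): open https://www.qcc.com first, inject the stored cookies, then hit a test company page so the session takes effect. A small hedged check that the injected session is actually live (the helper name and the title test are ours, mirroring the retired checklogin):

    def session_is_live(driver) -> bool:
        # Assumption: an anonymous session is redirected to the login
        # wall, whose <title> contains '会员登录'.
        driver.get('https://www.qcc.com/web/search?key=test')
        return '会员登录' not in driver.title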
 if __name__ == '__main__':
     taskType = '基本信息/企查查'
+    driver, id_cookie = login()
     while True:
         nowtime = baseCore.getNowTime(1).replace('-', '')[:8]
         file_name = f'./data/国内企业基本信息采集情况.xlsx'
+        print(file_name)
         file.createFile(file_name)
-        driver = login()
-        headers = {
-            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
-            'Accept-Encoding': 'gzip, deflate, br',
-            'Accept-Language': 'zh-CN,zh;q=0.9',
-            'Connection': 'keep-alive',
-            # 'Cookie': 'qcc_did=046d99c9-566e-4046-9094-689901b79748; UM_distinctid=18aac5b8c21810-046f8431aecf58-26031f51-1fa400-18aac5b8c22efd; CNZZDATA1254842228=109635008-1695108795-https%253A%252F%252Fwww.qcc.com%252F%7C1695113473; _uab_collina=169935323766710839405007; acw_tc=db9062a717000200596487102e63dac7bed6aad2a049361c973816fabf; QCCSESSID=3c95642bd6445b7681c8fc6411',
-            # 'Cookie': f'qcc_did={cookie_["qcc_did"]}; acw_tc={cookie_["acw_tc"]}; QCCSESSID={cookie_["QCCSESSID"]}',
-            'Host': 'www.qcc.com',
-            'Referer': 'https://www.qcc.com/',
-            'Sec-Ch-Ua': '"Google Chrome";v="119", "Chromium";v="119", "Not?A_Brand";v="24"',
-            'Sec-Ch-Ua-Mobile': '?0',
-            'Sec-Ch-Ua-Platform': '"Windows"',
-            'Sec-Fetch-Dest': 'document',
-            'Sec-Fetch-Mode': 'navigate',
-            'Sec-Fetch-Site': 'same-origin',
-            'Sec-Fetch-User': '?1',
-            'Upgrade-Insecure-Requests': '1',
-            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36'
-        }
+        # headers = {
+        # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
+        # 'Accept-Encoding': 'gzip, deflate, br',
+        # 'Accept-Language': 'zh-CN,zh;q=0.9',
+        # 'Connection': 'keep-alive',
+        # # 'Cookie': 'qcc_did=046d99c9-566e-4046-9094-689901b79748; UM_distinctid=18aac5b8c21810-046f8431aecf58-26031f51-1fa400-18aac5b8c22efd; CNZZDATA1254842228=109635008-1695108795-https%253A%252F%252Fwww.qcc.com%252F%7C1695113473; _uab_collina=169935323766710839405007; acw_tc=db9062a717000200596487102e63dac7bed6aad2a049361c973816fabf; QCCSESSID=3c95642bd6445b7681c8fc6411',
+        # # 'Cookie': f'qcc_did={cookie_["qcc_did"]}; acw_tc={cookie_["acw_tc"]}; QCCSESSID={cookie_["QCCSESSID"]}',
+        # 'Host': 'www.qcc.com',
+        # 'Referer': 'https://www.qcc.com/',
+        # 'Sec-Ch-Ua': '"Google Chrome";v="119", "Chromium";v="119", "Not?A_Brand";v="24"',
+        # 'Sec-Ch-Ua-Mobile': '?0',
+        # 'Sec-Ch-Ua-Platform': '"Windows"',
+        # 'Sec-Fetch-Dest': 'document',
+        # 'Sec-Fetch-Mode': 'navigate',
+        # 'Sec-Fetch-Site': 'same-origin',
+        # 'Sec-Fetch-User': '?1',
+        # 'Upgrade-Insecure-Requests': '1',
+        # 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36'
+        # }
         start_time = time.time()
         # 获取企业信息
         # company_field = baseCore.redicPullData('BaseInfoEnterprise:gnqy_socialCode')
@@ -640,7 +658,7 @@ if __name__ == '__main__':
         if company_field == '' or company_field is None:
             # 本轮结束后没有新增的企业要采集
-            # file.deleteFile(file_name)
+            file.deleteFile(file_name)
             flag = True
             while flag:
                 log.info('--------已没有数据---------')
@@ -676,7 +694,7 @@ if __name__ == '__main__':
             # listingDate = ''
             # category = ''
             # exchange = ''
-            file_name = ''
+
             count = redaytowork(com_name, social_code, securitiesCode, securitiesShortName, listingDate, category, exchange,listType, ynDomestic, countryName, file_name)
             time.sleep(10)
             # break
...
@@ -389,9 +389,9 @@ def ifbeforename(company_url):
 def spiderinfo(company_url, securitiesCode, securitiesShortName, listingDate, category, exchange, listType, ynDomestic, countryName, file_name):
     qccid = company_url.split('firm/')[1].split('.html')[0]
     # 将采集到的企查查id更新
-    updateSql = f"update EnterpriseInfo set QCCID = '{qccid}' where SocialCode = '{social_code}'"
-    cursor_.execute(updateSql)
-    cnx_.commit()
+    # updateSql = f"update EnterpriseInfo set QCCID = '{qccid}' where SocialCode = '{social_code}'"
+    # cursor_.execute(updateSql)
+    # cnx_.commit()
     # ip = baseCore.get_proxy()
     # req_ = requests.get(headers=headers, url=company_url, proxies=ip)
     req_ = requests.get(headers=headers, url=company_url)
...