Skip to content
项目
群组
代码片段
帮助
当前项目
正在载入...
登录 / 注册
切换导航面板
Z
zzsn_spider
概览
概览
详情
活动
周期分析
版本库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
统计图
问题
0
议题
0
列表
看板
标记
里程碑
合并请求
1
合并请求
1
CI / CD
CI / CD
流水线
作业
日程表
图表
维基
Wiki
代码片段
代码片段
成员
成员
折叠边栏
关闭边栏
活动
图像
聊天
创建新问题
作业
提交
问题看板
Open sidebar
丁双波
zzsn_spider
Commits
8392fb08
提交
8392fb08
authored
8月 12, 2023
作者:
LiuLiYuan
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
Changes
上级
289b93d8
隐藏空白字符变更
内嵌
并排
正在显示
1 个修改的文件
包含
170 行增加
和
133 行删除
+170
-133
雅虎财经_企业基本信息_高管信息.py
comData/yhcj/雅虎财经_企业基本信息_高管信息.py
+170
-133
没有找到文件。
comData/yhcj/雅虎财经_企业基本信息_高管信息.py
浏览文件 @
8392fb08
impor
t
json
impor
t
json
import
json
import
json
import
time
import
time
import
numpy
as
np
import
pandas
as
pd
import
pandas
as
pd
import
pymysql
import
requests
import
requests
from
bs4
import
BeautifulSoup
from
bs4
import
BeautifulSoup
from
kafka
import
KafkaProducer
from
kafka
import
KafkaProducer
...
@@ -11,7 +13,8 @@ from base.BaseCore import BaseCore
...
@@ -11,7 +13,8 @@ from base.BaseCore import BaseCore
import
urllib3
import
urllib3
urllib3
.
disable_warnings
(
urllib3
.
exceptions
.
InsecureRequestWarning
)
urllib3
.
disable_warnings
(
urllib3
.
exceptions
.
InsecureRequestWarning
)
cnx
=
pymysql
.
connect
(
host
=
'114.116.44.11'
,
user
=
'root'
,
password
=
'f7s0&7qqtK'
,
db
=
'clb_project'
,
charset
=
'utf8mb4'
)
cursor
=
cnx
.
cursor
()
baseCore
=
BaseCore
()
baseCore
=
BaseCore
()
log
=
baseCore
.
getLogger
()
log
=
baseCore
.
getLogger
()
headers
=
{
headers
=
{
...
@@ -31,37 +34,41 @@ headers = {
...
@@ -31,37 +34,41 @@ headers = {
'user-agent'
:
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/106.0.0.0 Safari/537.36'
'user-agent'
:
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/106.0.0.0 Safari/537.36'
}
}
# 获取股票代码
# 获取股票代码
def
getGpdm
(
name
):
#
def getGpdm(name):
start
=
time
.
time
()
#
start=time.time()
gpdm
=
""
#
gpdm=""
try
:
#
try:
#
url
=
f
'https://query1.finance.yahoo.com/v1/finance/search?q={name}&lang=en-US®ion=US"esCount=6&newsCount=2&listsCount=2&enableFuzzyQuery=false"esQueryId=tss_match_phrase_query&multiQuoteQueryId=multi_quote_single_token_query&newsQueryId=news_cie_vespa&enableCb=true&enableNavLinks=true&enableEnhancedTrivialQuery=true&enableResearchReports=true&enableCulturalAssets=true&enableLogoUrl=true&researchReportsCount=2'
#
url = f'https://query1.finance.yahoo.com/v1/finance/search?q={name}&lang=en-US®ion=US"esCount=6&newsCount=2&listsCount=2&enableFuzzyQuery=false"esQueryId=tss_match_phrase_query&multiQuoteQueryId=multi_quote_single_token_query&newsQueryId=news_cie_vespa&enableCb=true&enableNavLinks=true&enableEnhancedTrivialQuery=true&enableResearchReports=true&enableCulturalAssets=true&enableLogoUrl=true&researchReportsCount=2'
response
=
requests
.
get
(
url
,
headers
=
headers
,
verify
=
False
,
timeout
=
(
3.05
,
3
))
#
response = requests.get(url, headers=headers, verify=False,timeout=(3.05, 3))
time
.
sleep
(
3
)
#
time.sleep(3)
except
:
#
except:
return
gpdm
#
return gpdm
if
(
response
.
status_code
==
200
):
#
if (response.status_code == 200):
pass
#
pass
else
:
#
else:
log
.
error
(
f
"{name}------获取股票接口返回失败:{response.status_code}"
)
#
log.error(f"{name}------获取股票接口返回失败:{response.status_code}")
return
gpdm
#
return gpdm
retJson
=
json
.
loads
(
response
.
content
.
decode
(
'utf-8'
))
#
retJson = json.loads(response.content.decode('utf-8'))
try
:
#
try:
gpdm
=
retJson
[
'quotes'
][
0
][
'symbol'
]
#
gpdm =retJson['quotes'][0]['symbol']
except
:
#
except:
log
.
error
(
f
"{name}---获取股票代码异常"
)
#
log.error(f"{name}---获取股票代码异常")
return
gpdm
#
return gpdm
log
.
info
(
f
"获取股票代码--{name},耗时{baseCore.getTimeCost(start, time.time())}"
)
#
log.info(f"获取股票代码--{name},耗时{baseCore.getTimeCost(start, time.time())}")
#
return
gpdm
#
return gpdm
# 根据股票代码 获取企业基本信息 高管信息
# 根据股票代码 获取企业基本信息 高管信息
def
getInfo
(
name
,
gpdm
,
xydm
):
def
getInfo
(
gpdm
,
xydm
):
gpdm_
=
gpdm
if
'HK'
in
gpdm_
:
gpdm_
=
gpdm_
start
=
time
.
time
()
start
=
time
.
time
()
retData
=
{}
retData
=
{}
retData
[
'base_info'
]
=
{
retData
[
'base_info'
]
=
{
'公司名称'
:
name
,
'公司名称'
:
''
,
'英文名'
:
''
,
'信用代码'
:
xydm
,
'信用代码'
:
xydm
,
'股票代码'
:
gpdm
,
'股票代码'
:
gpdm
,
'地址'
:
''
,
'地址'
:
''
,
...
@@ -128,7 +135,8 @@ def getInfo(name,gpdm,xydm):
...
@@ -128,7 +135,8 @@ def getInfo(name,gpdm,xydm):
except
:
except
:
com_jianjie
=
''
com_jianjie
=
''
dic_com_info
=
{
dic_com_info
=
{
'公司名称'
:
name
,
'公司名称'
:
''
,
'英文名'
:
''
,
'信用代码'
:
xydm
,
'信用代码'
:
xydm
,
'股票代码'
:
gpdm
,
'股票代码'
:
gpdm
,
'地址'
:
com_address
,
'地址'
:
com_address
,
...
@@ -178,7 +186,7 @@ def getInfo(name,gpdm,xydm):
...
@@ -178,7 +186,7 @@ def getInfo(name,gpdm,xydm):
if
(
p_year
==
"N/A"
):
if
(
p_year
==
"N/A"
):
p_year
=
""
p_year
=
""
dic_main_people
=
{
dic_main_people
=
{
'公司名称'
:
name
,
'公司名称'
:
''
,
'股票代码'
:
gpdm
,
'股票代码'
:
gpdm
,
'信用代码'
:
xydm
,
'信用代码'
:
xydm
,
'姓名'
:
p_name
,
'姓名'
:
p_name
,
...
@@ -191,52 +199,55 @@ def getInfo(name,gpdm,xydm):
...
@@ -191,52 +199,55 @@ def getInfo(name,gpdm,xydm):
retData
[
'people_info'
]
=
retPeople
retData
[
'people_info'
]
=
retPeople
df_retData
=
pd
.
DataFrame
(
retPeople
)
df_retData
=
pd
.
DataFrame
(
retPeople
)
# df_a = pd.DataFrame(retData['base_info'])
# df_a = pd.DataFrame(retData['base_info'])
df_retData
.
to_excel
(
'采集高管结果1.xlsx'
,
index
=
False
)
df_retData
.
to_excel
(
'
./data/
采集高管结果1.xlsx'
,
index
=
False
)
log
.
info
(
f
"获取基本信息--{gpdm},耗时{baseCore.getTimeCost(start, time.time())}"
)
log
.
info
(
f
"获取基本信息--{gpdm},耗时{baseCore.getTimeCost(start, time.time())}"
)
return
retData
return
retData
def
Nongpdm
(
xydm
,
name
,
officialUrl
,
industry
,
englishName
,
address
):
# def Nongpdm(xydm,name):
# start = time.time()
# company_dict = {
# 'name': name, # 企业名称
# 'shortName': '', # 企业简称
# 'socialCreditCode': xydm, # 统一社会信用代码
# 'officialPhone': '', # 电话
# 'officialUrl': '', # 官网
# 'briefInfo': '', # 简介
# 'industry': '', # 所属行业
# 'englishName': name, # 英文名
# 'address': '', # 地址
# 'status': 0, # 状态
# }
# # print(company_dict)
# producer = KafkaProducer(bootstrap_servers=['114.115.159.144:9092'], api_version=(2, 0, 2))
# kafka_result = producer.send("regionInfo", json.dumps(company_dict, ensure_ascii=False).encode('utf8'))
# kafka_result.get(timeout=10)
# # log.info(f"保存基本信息--{info['base_info']['信用代码']},耗时{baseCore.getTimeCost(start, time.time())}")
# log.info(f"保存基本信息--{company_dict['name']},耗时{baseCore.getTimeCost(start, time.time())}")
# return company_dict
#
#保存基本信息
def
saveBaseInfo
(
info
):
start
=
time
.
time
()
start
=
time
.
time
()
#基本信息发送到kafka
company_dict
=
{
company_dict
=
{
'name'
:
name
,
# 企业名称
'name'
:
''
,
# 企业名称
'shortName'
:
''
,
# 企业简称
'shortName'
:
''
,
# 企业简称
'socialCreditCode'
:
xydm
,
# 统一社会信用代码
'socialCreditCode'
:
info
[
'base_info'
][
'信用代码'
]
,
# 统一社会信用代码
'officialPhone'
:
''
,
# 电话
'officialPhone'
:
info
[
'base_info'
][
'电话'
]
,
# 电话
'officialUrl'
:
officialUrl
,
# 官网
'officialUrl'
:
info
[
'base_info'
][
'公司网站'
]
,
# 官网
'briefInfo'
:
''
,
# 简介
'briefInfo'
:
info
[
'base_info'
][
'公司简介'
]
,
# 简介
'industry'
:
in
dustry
,
# 所属行业
'industry'
:
in
fo
[
'base_info'
][
'行业'
]
,
# 所属行业
'englishName'
:
englishName
,
# 英文名
'englishName'
:
info
[
'base_info'
][
'英文名'
]
,
# 英文名
'address'
:
address
,
# 地址
'address'
:
info
[
'base_info'
][
'地址'
]
,
# 地址
'status'
:
0
,
# 状态
'status'
:
0
,
# 状态
}
}
# print(company_dict)
producer
=
KafkaProducer
(
bootstrap_servers
=
[
'114.115.159.144:9092'
],
api_version
=
(
2
,
0
,
2
))
producer
=
KafkaProducer
(
bootstrap_servers
=
[
'114.115.159.144:9092'
],
api_version
=
(
2
,
0
,
2
))
kafka_result
=
producer
.
send
(
"regionInfo"
,
json
.
dumps
(
company_dict
,
ensure_ascii
=
False
)
.
encode
(
'utf8'
))
kafka_result
=
producer
.
send
(
"regionInfo"
,
json
.
dumps
(
company_dict
,
ensure_ascii
=
False
)
.
encode
(
'utf8'
))
kafka_result
.
get
(
timeout
=
10
)
kafka_result
.
get
(
timeout
=
10
)
#
log.info(f"保存基本信息--{info['base_info']['信用代码']},耗时{baseCore.getTimeCost(start, time.time())}")
log
.
info
(
f
"保存基本信息--{info['base_info']['信用代码']},耗时{baseCore.getTimeCost(start, time.time())}"
)
log
.
info
(
f
"保存基本信息--{company_dict['name']},耗时{baseCore.getTimeCost(start, time.time())}"
)
log
.
info
(
f
"保存基本信息--{company_dict['name']},耗时{baseCore.getTimeCost(start, time.time())}"
)
return
company_dict
#保存基本信息
# def saveBaseInfo(info):
# start = time.time()
# #基本信息发送到kafka
# company_dict = {
# 'name': info['base_info']['公司名称'], # 企业名称
# 'shortName': info['base_info']['公司名称'], # 企业简称
# 'socialCreditCode': info['base_info']['信用代码'], # 统一社会信用代码
# 'officialPhone': info['base_info']['电话'], # 电话
# 'officialUrl': info['base_info']['公司网站'], # 官网
# 'briefInfo': info['base_info']['公司简介'], # 简介
# 'industry': info['base_info']['行业'], # 所属行业
# 'englishName': info['base_info']['公司名称'], # 英文名
# 'address': info['base_info']['地址'], # 地址
# 'status': 0, # 状态
# }
# producer = KafkaProducer(bootstrap_servers=['114.115.159.144:9092'], api_version=(2, 0, 2))
# kafka_result = producer.send("regionInfo", json.dumps(company_dict, ensure_ascii=False).encode('utf8'))
# kafka_result.get(timeout=10)
# log.info(f"保存基本信息--{info['base_info']['信用代码']},耗时{baseCore.getTimeCost(start, time.time())}")
# # log.info(f"保存基本信息--{company_dict['name']},耗时{baseCore.getTimeCost(start, time.time())}")
#保存高管信息
#保存高管信息
def
savePeopleInfo
(
info
):
def
savePeopleInfo
(
info
):
...
@@ -262,8 +273,9 @@ def savePeopleInfo(info):
...
@@ -262,8 +273,9 @@ def savePeopleInfo(info):
}
}
list_one_info
.
append
(
dic_json
)
list_one_info
.
append
(
dic_json
)
json_updata
=
json
.
dumps
(
list_one_info
)
json_updata
=
json
.
dumps
(
list_one_info
)
# print(json_updata)
if
json_updata
==
'[]'
:
if
json_updata
==
'[]'
:
pass
pass
else
:
else
:
for
i
in
range
(
0
,
3
):
for
i
in
range
(
0
,
3
):
response
=
requests
.
post
(
'http://114.115.236.206:9988/datapull/sync/executive'
,
data
=
json_updata
,
timeout
=
300
,
verify
=
False
)
response
=
requests
.
post
(
'http://114.115.236.206:9988/datapull/sync/executive'
,
data
=
json_updata
,
timeout
=
300
,
verify
=
False
)
...
@@ -298,74 +310,99 @@ def beginWork():
...
@@ -298,74 +310,99 @@ def beginWork():
#给定excel名单 保存股票代码
#给定excel名单 保存股票代码
okCount
=
0
okCount
=
0
errorCount
=
0
errorCount
=
0
df_all_xydm
=
pd
.
read_excel
(
'../../data/工作簿1.xlsx'
,
dtype
=
str
,
keep_default_na
=
False
)
df_all
=
pd
.
read_excel
(
'./data/福布斯美国企业股票代码.xlsx'
,
dtype
=
str
,
keep_default_na
=
False
)
df_all
=
pd
.
read_excel
(
'../../data/23年500强企业新榜股票代码.xlsx'
,
dtype
=
str
,
keep_default_na
=
False
)
a
=
[]
for
i
in
range
(
len
(
df_all_xydm
)):
b
=
[]
# name = df_all['中文名称'][i]
# df_all = pd.read_excel('../../data/23年500强企业新榜股票代码.xlsx',dtype=str, keep_default_na=False)
# rank = df_all['排名'][i]
for
i
in
range
(
len
(
df_all
)):
# officialUrl = df_all['企业官网'][i]
# # name = df_all['中文名称'][i]
# industry = df_all['行业'][i]
# # rank = df_all['排名'][i]
# englishName = df_all['英文名称'][i]
# # officialUrl = df_all['企业官网'][i]
# address = df_all['企业总部地址'][i]
# # industry = df_all['行业'][i]
# # englishName = df_all['英文名称'][i]
xydm_name
=
df_all_xydm
[
'名称'
][
i
]
# # address = df_all['企业总部地址'][i]
# print(xydm_name)
#
for
j
in
range
(
len
(
df_all
)):
# xydm_name = df_all_xydm['名称'][i]
name
=
df_all
[
'中文名称'
][
j
]
# # print(xydm_name)
if
name
==
xydm_name
:
# for j in range(len(df_all)):
print
(
name
,
xydm_name
)
# name = df_all['中文名称'][j]
xydm
=
df_all_xydm
[
'信用代码'
][
i
]
# if name == xydm_name:
if
i
>=
22
:
# print(name,xydm_name)
pass
# xydm = df_all_xydm['信用代码'][i]
else
:
# if i>=22:
continue
# pass
log
.
info
(
f
"{i}----------开始"
)
# else:
# country = df_all['企业所属国家'][i]
# continue
# if country=='中国':
# log.info(f"{i}----------开始")
# continue
# # country = df_all['企业所属国家'][i]
# else:
# # if country=='中国':
# log.info(f"{i}----------为国外企业 继续")
# # continue
gpdm
=
df_all
[
'股票代码'
][
j
]
# # else:
#没有股票代码,就保存榜单中的数据
# # log.info(f"{i}----------为国外企业 继续")
if
gpdm
==
''
:
name
=
df_all
[
'name'
][
i
]
continue
ename
=
df_all
[
'companyName'
][
i
]
# xydm = baseCore.getNextXydm()
gpdm
=
df_all
[
'code'
][
i
]
# Nongpdm(xydm,name,officialUrl,industry,englishName,address)
xydm
=
df_all
[
'xydm'
][
i
]
else
:
# if gpdm == '':
log
.
info
(
f
"{j}----------为股票代码不为空 继续"
)
# Nongpdm(xydm,name)
pass
# continue
enname
=
df_all
[
'英文名称'
][
j
]
# # print(f'名字:{name},股票代码:{gpdm},信用代码:{xydm}')
if
enname
!=
''
:
# # #没有股票代码,就保存榜单中的数据
pass
# if gpdm == '':
else
:
# Nongpdm(gpdm,name)
log
.
info
(
f
"{j}----------英文名字为空 跳过"
)
# continue
continue
# # xydm = baseCore.getNextXydm()
# log.info(f"{i}----------开始股票代码")
# # # Nongpdm(xydm,name,officialUrl,industry,englishName,address)
# gpdm = getGpdm(enname)
# # else:
# xydm=baseCore.getNextXydm()
# # log.info(f"{j}----------为股票代码不为空 继续")
# # pass
retData
=
getInfo
(
enname
,
gpdm
,
xydm
)
# # enname = df_all['英文名称'][j]
# saveBaseInfo(retData)
# # if enname != '':
savePeopleInfo
(
retData
)
# # pass
#也可以去采集企业动态
# # else:
news
(
j
,
gpdm
,
xydm
)
# # log.info(f"{j}----------英文名字为空 跳过")
# # continue
if
gpdm
!=
''
:
# # # log.info(f"{i}----------开始股票代码")
okCount
=
okCount
+
1
# # # gpdm = getGpdm(enname)
else
:
# # # xydm=baseCore.getNextXydm()
errorCount
=
errorCount
+
1
# #
log
.
info
(
f
"{j}-------成功{okCount}--失败-{errorCount}"
)
try
:
if
gpdm
==
''
:
retData
=
getInfo
(
name
,
ename
,
gpdm
,
xydm
)
continue
saveBaseInfo
(
retData
)
else
:
savePeopleInfo
(
retData
)
pass
#采集企业动态
df_all
[
'股票代码'
][
j
]
=
gpdm
news
(
i
,
gpdm
,
xydm
)
else
:
okCount
+=
1
continue
b
.
append
([
name
,
gpdm
,
xydm
])
if
(
i
%
10
==
0
):
print
(
name
,
'.......采集成功'
)
df_all
.
to_excel
(
r'..\..\data\23年500强企业新上榜_ret22.xlsx'
,
sheet_name
=
'Sheet1'
,
index
=
False
,
header
=
True
)
except
:
df_all
.
to_excel
(
r'..\..\data\23年500强企业新榜_ret22.xlsx'
,
sheet_name
=
'Sheet1'
,
index
=
False
,
header
=
True
)
a
.
append
([
name
,
gpdm
,
xydm
])
# 释放资源
errorCount
+=
1
print
(
name
,
'......采集失败'
)
print
(
f
'成功{okCount}个,失败{errorCount}个'
)
df_a
=
pd
.
DataFrame
(
np
.
array
(
a
))
df_a
.
columns
=
[
'name'
,
'gpdm'
,
'xydm'
]
df_a
.
to_excel
(
'./data/没有采集到.xlsx'
,
index
=
False
)
df_b
=
pd
.
DataFrame
(
np
.
array
(
b
))
df_b
.
columns
=
[
'name'
,
'gpdm'
,
'xydm'
]
df_b
.
to_excel
(
'./data/采集到.xlsx'
,
index
=
False
)
#
# if gpdm!='':
# okCount=okCount+1
# else:
# errorCount=errorCount+1
# log.info(f"{i}-------成功{okCount}--失败-{errorCount}")
# # if gpdm == '':
# # continue
# # else:
# # pass
# # df_all['股票代码'][j]=gpdm
# # else:
# # continue
# # if (i % 10 == 0):
# # df_all.to_excel(r'./data/23年500强企业新上榜_ret22.xlsx', sheet_name='Sheet1', index=False, header=True)
# # df_all.to_excel(r'./data/23年500强企业新榜_ret22.xlsx', sheet_name='Sheet1', index=False, header=True)
# # 释放资源
baseCore
.
close
()
baseCore
.
close
()
...
...
编写
预览
Markdown
格式
0%
重试
或
添加新文件
添加附件
取消
您添加了
0
人
到此讨论。请谨慎行事。
请先完成此评论的编辑!
取消
请
注册
或者
登录
后发表评论