# -*- coding: utf-8 -*-
import datetime
import time

import pymysql
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
from pyquery import PyQuery as pq
from openpyxl import Workbook
import pandas as pd

class WanfangSpider(object):
    def __init__(self):
        """Create a spider; no instance state is set up."""

    def req(self,url,timeout=30):
        """Fetch ``url`` with HTTP GET and return the response body as text.

        Args:
            url: Absolute URL to request.
            timeout: Seconds to wait for the server before giving up. Without
                a timeout ``requests`` may wait indefinitely on a stalled
                connection.

        Returns:
            The response text when the status code is 200, otherwise an empty
            string. An empty string is also returned when the request itself
            fails (connection error, timeout, invalid URL).
        """
        header={
            "accept":"*/*",
            "connection":"Keep-Alive",
            "user-agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36"
        }
        try:
            res = requests.get(url,headers=header,timeout=timeout)
        except requests.RequestException as exc:
            # Report transport-level failures through the same "empty text"
            # channel as a non-200 status so one bad page does not abort the
            # whole crawl.
            print('请求失败！', exc)
            return ''
        if res.status_code==200:
            print('请求成功！')
            return res.text
        print('请求失败！')
        return ''
    # 将html中的相对地址转换成绝对地址
    def paserUrl(self,html,listurl):
        """Parse ``html`` and rewrite link targets as absolute URLs.

        Only ``<a>`` and ``<img>`` tags are visited. For each tag the ``href``
        attribute is resolved against ``listurl``; ``src`` is resolved only
        when the tag has no ``href``.

        Args:
            html: HTML markup to parse.
            listurl: Base URL that relative addresses are joined with.

        Returns:
            The modified ``BeautifulSoup`` document.
        """
        soup = BeautifulSoup(html, 'html.parser')
        for tag in soup.find_all(['a', 'img']):
            # First matching attribute wins, mirroring an if/elif on href/src.
            for attr in ('href', 'src'):
                if attr in tag.attrs:
                    tag[attr] = urljoin(listurl, tag[attr])
                    break
        return soup
    def pageList(self,start,end):
        """Collect title/link pairs from a range of search result pages.

        Args:
            start: First page number to request (inclusive).
            end: Page number to stop before (exclusive), as in
                ``range(start, end)``.

        Returns:
            A list of dicts with the keys ``'title'`` (link text) and
            ``'turl'`` (absolute link target), one per ``a.title`` found
            inside an ``li.rt-wrap`` element. Pages whose request failed
            contribute nothing.
        """
        listmsg=[]
        for num in range(start,end):
            url=f'https://kms.wanfangdata.com.cn/IndustryYJ/Search/Cecdb?q=%E5%86%B6%E9%87%91%2B%E5%86%B6%E7%82%BC&PageNumber={num}'
            html=self.req(url)
            if not html:
                # req() returns '' when the request failed; there is nothing
                # to parse, so move on to the next page.
                continue
            soup=self.paserUrl(html,url)
            doc=pq(str(soup.prettify()))
            for li in doc('li[class="rt-wrap"]'):
                anchor=pq(li)('a[class="title"]')
                title=anchor.text()
                turl=anchor.attr('href')
                # Show the entry's own link, not the list page it came from.
                print(f'title:{title}  url:{turl}')
                listmsg.append({'title':title,'turl':turl})
        return listmsg

    def detailMsg(self,msg):
        """Scrape one detail page and append its fields to the Excel file.

        Rows 2 through 21 of the ``table.detail-md`` element are read as
        label/value pairs: the text of the first cell (with ASCII colons
        removed) becomes the column name and the text of the second cell
        becomes the value. If two rows yield the same label, the later row's
        value replaces the earlier one.

        Args:
            msg: Dict with the keys ``'turl'`` (detail page URL) and
                ``'title'``, as produced by ``pageList``.

        Returns:
            None. The collected record is handed to ``writerToExcel``.
        """
        turl = msg['turl']
        title = msg['title']
        html=self.req(turl)
        if not html:
            # req() returns '' when the request failed; skip this record
            # instead of parsing an empty document.
            print(f'skip detail page (empty response): {turl}')
            return
        soup=self.paserUrl(html,turl)
        ddoc=pq(str(soup.prettify()))
        detailmsg={
            'title':title,
            'turl':turl
        }
        for row in range(2,22):
            row_sel=f'table[class="detail-md"]>tr:nth-child({row})'
            label=ddoc(f'{row_sel}>td:nth-child(1)').text().replace(":","")
            detailmsg[label]=ddoc(f'{row_sel}>td:nth-child(2)').text()
        self.writerToExcel([detailmsg])

    def conn144(self):
        """Open a MySQL connection to the ``caiji`` database.

        Returns:
            A ``(connection, cursor)`` tuple. The caller is responsible for
            closing both objects.
        """
        # NOTE(review): host and credentials are hard-coded in source;
        # consider loading them from configuration or the environment.
        # ``password``/``database`` replace the deprecated ``passwd``/``db``
        # keyword aliases.
        conn = pymysql.connect(host='114.115.159.144', port=3306, user='caiji', password='zzsn9988', database='caiji',
                               charset='utf8')
        cursor = conn.cursor()
        return conn,cursor

    def dataToSql(self,detailmsg):
        """Open a database connection for storing ``detailmsg`` and close it again.

        NOTE: no SQL statement is executed yet; the method only obtains a
        connection via ``conn144``, formats the current timestamp, and
        releases the connection.

        Args:
            detailmsg: Record to store (currently unused).

        Returns:
            None.
        """
        conn,cursor=self.conn144()
        try:
            # Timestamp formatted as "YYYY-MM-DD HH:MM:SS"; not used until an
            # insert statement is added here.
            currentdate = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        except Exception as e:
            # Include the exception so a failure is diagnosable.
            print('+++++', e)
        finally:
            # Close the connection even if closing the cursor raises.
            try:
                cursor.close()
            finally:
                conn.close()
    # 将数据追加到excel
    def writerToExcel(self,detailList,filename=None):
        """Append rows to an existing ``.xlsx`` file.

        The whole workbook is read, the new rows are added after the existing
        ones, and the result is written back to the same path.

        Args:
            detailList: List of dicts, one per row; dict keys become column
                names.
            filename: Path of the workbook to update. When omitted, the
                module-level ``filename`` variable (set by the script entry
                point) is used.

        Raises:
            NameError: If ``filename`` is omitted and no module-level
                ``filename`` exists.
        """
        if filename is None:
            try:
                filename = globals()['filename']
            except KeyError:
                raise NameError("name 'filename' is not defined") from None
        existing_data = pd.read_excel(filename)
        new_data = pd.DataFrame(data=detailList)
        # DataFrame.append was removed in pandas 2.0; pd.concat is the
        # supported way to stack the new rows under the existing ones.
        combined_data = pd.concat([existing_data, new_data], ignore_index=True)
        combined_data.to_excel(filename, index=False)
        print('保存成功！！')

if __name__ == '__main__':
    wanfang=WanfangSpider()
    # Exclusive upper bound of the page numbers to crawl.
    page_stop=1321
    # Crawl in chunks of 100 pages, writing one workbook per chunk.
    for num in range(801,page_stop,100):
        # Module-level name: writerToExcel reads ``filename`` to find the
        # workbook it appends to.
        filename=f'企业_{num}.xlsx'
        # Start each chunk from a fresh, empty workbook.
        workbook = Workbook()
        workbook.save(filename)
        start=num
        # Clamp the last chunk so it does not run past the final page.
        end=min(num+100,page_stop)
        for msg in wanfang.pageList(start,end):
            wanfang.detailMsg(msg)