import pandas as pd
import requests
import urllib3
from pyquery import PyQuery as pq
urllib3.disable_warnings()

def getHtml(url):
    """Fetch *url* with a GET request and return the response body as text.

    TLS certificate verification is turned off and the request is given a
    10-second timeout. Any exception raised while fetching is swallowed and
    an empty string is returned instead, so callers can treat ``''`` as
    "the fetch failed".
    """
    try:
        return requests.get(url, verify=False, timeout=10).text
    except Exception:
        # Best-effort fetch: every failure is reported as an empty page.
        return ''

def getList(start_page=17, stop_page=139):
    """Crawl whitehouse.gov search-result pages for "Russia" and scrape each hit.

    For every page number in ``range(start_page, stop_page)`` the result page
    is fetched (one attempt plus up to three retries). Each ``<article>``
    entry on the page is turned into a dict with the keys ``title``, ``url``,
    ``time``, ``type`` and ``summary`` and handed to ``getDetail``.

    A page that cannot be fetched or parsed is skipped, so a single bad page
    does not end the whole crawl.

    Args:
        start_page: First result page to fetch (inclusive).
        stop_page: Page number at which to stop (exclusive).
    """
    for i in range(start_page, stop_page):
        print(f'============开始采集第{i}页=========')
        page_url = f'https://www.whitehouse.gov/?s=Russia&paged={i}'

        # getHtml returns '' on failure; try up to four times in total.
        html = ''
        for _ in range(4):
            html = getHtml(page_url)
            if html:
                break
        if not html:
            print(f'第{i}页获取失败,已跳过')
            continue

        try:
            doc = pq(html)
        except Exception:
            # Unparseable page: move on to the next one instead of aborting.
            print(f'第{i}页解析失败,已跳过')
            continue

        articles = doc('div[class="col-md-10 col-lg-6"]>article')
        for ii, node in enumerate(articles):
            entry = pq(node)
            link = entry('h2[class="entry-title"]>a')
            # attr() gives None when there is no <time> element or it has no
            # datetime attribute; fall back to an empty date in that case.
            stamp = entry('time').attr('datetime')
            detail = {
                'title': link.text(),
                'url': link.attr('href'),
                'time': stamp.split('T')[0] if stamp else '',
                'type': entry('span[class="tax-links cat-links"]>a').text(),
                'summary': entry('div[class="post-content"]').text(),
            }
            getDetail(detail)
            print(f'=============第{i}页===第{ii}条采集完成==========')

def getDetail(detail):
    """Fetch an article page, add its body text to *detail*, and save the record.

    The page at ``detail['url']`` is fetched (one attempt plus up to three
    retries), the text of ``section.body-content`` is stored under
    ``detail['content']``, and the record is passed to ``writerToExcel``.

    Args:
        detail: Dict describing one search hit; its ``'url'`` entry is the
            article address. The dict is modified in place on success.

    Returns:
        The extracted body text, or ``None`` when the record has no URL or
        the page could not be fetched or parsed. Nothing is written and
        *detail* is left untouched in the ``None`` case.
    """
    url = detail.get('url')
    if not url:
        # No link to follow: skip without issuing requests that cannot succeed.
        return None

    # getHtml returns '' on failure; try up to four times in total.
    html = ''
    for _ in range(4):
        html = getHtml(url)
        if html:
            break
    if not html:
        return None

    try:
        doc = pq(html)
    except Exception:
        return None

    content = doc('section[class="body-content"]').text()
    detail['content'] = content
    writerToExcel([detail])
    return content

# Append scraped records to the Excel workbook.
def writerToExcel(detailList, path=None):
    """Append the records in *detailList* to an existing ``.xlsx`` file.

    The whole workbook is read, the new rows are added after the existing
    ones, and the combined table is written back to the same file.

    Args:
        detailList: List of dicts, one per row to append. An empty list is a
            no-op.
        path: Workbook to update. When omitted, the module-level
            ``filename`` is used.
    """
    if not detailList:
        return

    target = filename if path is None else path
    existing_data = pd.read_excel(target)
    new_data = pd.DataFrame(data=detailList)
    # DataFrame.append was removed in pandas 2.0; concat is the supported way
    # to stack the new rows under the existing ones.
    combined_data = pd.concat([existing_data, new_data], ignore_index=True)
    combined_data.to_excel(target, index=False)

from openpyxl import Workbook
if __name__ == '__main__':
    # Output workbook; writerToExcel reads this module-level name.
    filename = './cis.xlsx'
    # Save a fresh, empty workbook so the first read in writerToExcel finds
    # a file. This replaces any existing file at that path.
    Workbook().save(filename)
    getList()