import time
from urllib.parse import unquote

import requests
from bs4 import BeautifulSoup
from retry import retry

from base import BaseCore

# Shared project helper: supplies the logger and the proxy used by the request
# functions below, and is closed by the script entry point.
baseCore = BaseCore.BaseCore()
log = baseCore.getLogger()
# HTTP headers sent with every search POST: a form-encoded body, a desktop
# browser User-Agent and the XMLHttpRequest marker.
headers = {
    'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0',
    'X-Requested-With': 'XMLHttpRequest',
}


@retry(tries=3, delay=10)
def getPageSize():
    """Return how many result pages the 'REITs' search yields.

    Posts the first-page query to the search interface, reads the ``total``
    field of the JSON reply and converts it to a page count at 10 results
    per page (rounded up).

    Returns:
        Number of pages needed to cover ``total`` results.

    Raises:
        requests.RequestException: network failure or timeout.
        ValueError, KeyError: the reply is not JSON or has no ``total`` field.
        Any of these is only propagated once the ``@retry`` attempts
        (tries=3, delay=10) are exhausted.
    """
    # NOTE(review): get_proxy() is passed straight to ``proxies=`` below, so it
    # presumably returns a requests-style proxies mapping - confirm.
    ip = baseCore.get_proxy()
    url = 'https://search.zj.gov.cn/jsearchfront/interfaces/cateSearch.do'
    data_post = {
        'websiteid': '330000000000000',
        'pg': '10',
        'p': '1',
        'tpl': '1569',
        'cateid': '372',
        'word': 'REITs',
        'checkError': '1',
        'isContains': '1',
        'q': 'REITs',
        'pos': 'content,filenumber',
        'sortType': '1',
    }
    # Without a timeout a stalled proxy would block forever and the retry
    # decorator would never get a chance to run another attempt.
    req = requests.post(url, headers=headers, data=data_post, proxies=ip, timeout=30)
    try:
        req.encoding = req.apparent_encoding
        total = req.json()['total']
    finally:
        # Release the connection even when the body cannot be parsed.
        req.close()
    # Ceiling division: a partially filled last page still counts as a page.
    return -(-total // 10)

@retry(tries=3, delay=10)
def getDataJson(page):
    """Fetch one page of 'REITs' search results.

    Args:
        page: Page number sent as the ``p`` form field.

    Returns:
        The ``result`` field of the JSON reply.

    Raises:
        requests.RequestException: network failure or timeout.
        ValueError, KeyError: the reply is not JSON or has no ``result`` field.
        Any of these is only propagated once the ``@retry`` attempts
        (tries=3, delay=10) are exhausted.
    """
    # NOTE(review): get_proxy() is passed straight to ``proxies=`` below, so it
    # presumably returns a requests-style proxies mapping - confirm.
    ip = baseCore.get_proxy()
    url = 'https://search.zj.gov.cn/jsearchfront/interfaces/cateSearch.do'
    data_post = {
        'websiteid': '330000000000000',
        'pg': '10',
        'p': f'{page}',
        'tpl': '1569',
        'cateid': '372',
        'word': 'REITs',
        'checkError': '1',
        'isContains': '1',
        'q': 'REITs',
        'pos': 'content,filenumber',
        'sortType': '1',
    }
    # Without a timeout a stalled proxy would block forever and the retry
    # decorator would never get a chance to run another attempt.
    req = requests.post(url, headers=headers, data=data_post, proxies=ip, timeout=30)
    try:
        req.encoding = req.apparent_encoding
        data_json = req.json()['result']
    finally:
        # Release the connection whether or not parsing succeeded.
        req.close()
    return data_json

def _parseSearchResult(data_):
    """Extract the metadata fields from one search-result HTML fragment.

    Args:
        data_: HTML markup of a single result entry.

    Returns:
        dict with the keys ``title``, ``href``, ``organ``, ``writtenDate``,
        ``origin`` and ``publishDate``. ``organ`` is ``''`` and
        ``writtenDate`` is ``None`` when the entry has no usable
        ``fgwj_table_list`` table.

    Raises:
        AttributeError, IndexError: the title link or the ``sourceTime``
            block is missing or does not have the expected layout.
    """
    soup = BeautifulSoup(data_, 'lxml')

    titleLink = soup.find('div', class_='titleWrapper').find('a', class_='textTitle')
    title = titleLink.text.strip().replace(' ', '').replace('\r\n', ' ')
    # The link carries the real address percent-encoded after ``url=``;
    # cut it at the first '.html' and decode every escape, not just ':' and '/'.
    encodedTarget = titleLink.get('href').split('url=')[1].split('.html')[0]
    href = unquote(encodedTarget) + '.html'

    # Issuing organ and written date are optional: only some entries carry
    # the table, so a missing or differently shaped one is not an error.
    try:
        info = soup.find('table', class_='fgwj_table_list').text
        organ = info.split('发布机构：')[1].split('成文日期：')[0].strip()
        writtenDate = info.split('成文日期：')[1].strip()
    except (AttributeError, IndexError):
        organ = ''
        writtenDate = None

    sourceTime = soup.find('div', class_='sourceTime').text
    # Drop every whitespace character (plain or non-breaking spaces, line
    # breaks) from the source name.
    origin = ''.join(sourceTime.split('来源:')[1].split('时间:')[0].split())
    publishDate = sourceTime.split('时间:')[1].strip()

    return {
        'title': title,
        'href': href,
        'organ': organ,
        'writtenDate': writtenDate,
        'origin': origin,
        'publishDate': publishDate,
    }


def getDatas(page):
    """Parse every search result on one page and log its source.

    Args:
        page: Page number to fetch via ``getDataJson``.

    Returns:
        List of record dicts, one per result entry, in page order (see
        ``_parseSearchResult`` for the keys).

    Raises:
        AttributeError, IndexError: an entry lacks the title link or the
            ``sourceTime`` block.
    """
    records = []
    for data_ in getDataJson(page):
        record = _parseSearchResult(data_)
        log.info(record['origin'])
        records.append(record)
        # NOTE(review): 5-second pause after each entry, kept as-is; presumably
        # throttling for follow-up requests - confirm it is still needed.
        time.sleep(5)
    return records


def doJob():
    """Process every result page, from page 1 up to the reported page count."""
    lastPage = getPageSize()
    current = 1
    while current <= lastPage:
        datas = getDatas(current)
        current += 1


if __name__ == '__main__':
    # Always call baseCore.close(), even when the crawl aborts with an
    # exception; the exception itself still propagates.
    try:
        doJob()
    finally:
        baseCore.close()
