# -*- coding: utf-8 -*-
# @Time : 2023/2/25 10:51
# @Author : ctt
# @File : 文本内容提取
# @Project : untitled1
import re
from docx import Document
import pandas as pd


class Other_Extract:
    """Extract labelled single-line fields from report text.

    Each field is looked up by a fixed Chinese caption followed by a
    full-width colon (for example ``单位负责人：``); the value is whatever
    follows the caption on the same line.
    """

    def __init__(self):
        # ``[^\n]*`` takes the remainder of the caption's line. No terminating
        # newline is required, so a field that sits on the very last line of
        # the text (without a trailing '\n') is still found.
        self.unitName_pattern = re.compile(r'(?<=部门（单位）名称：)[^\n]*')
        self.unitChargePeople_pattern = re.compile(r'(?<=单位负责人：)[^\n]*')
        self.financeChargePeople_pattern = re.compile(r'(?<=财务负责人：)[^\n]*')
        self.filledPeople_pattern = re.compile(r'(?<=编制人：)[^\n]*')
        # Stored under the key 'year', but the pattern captures the whole
        # value that follows the "报送日期：" caption.
        self.year_pattern = re.compile(r'(?<=报送日期：)[^\n]*')

    @staticmethod
    def match(pattern, text):
        """Return the first match of ``pattern`` in ``text``, stripped.

        :param pattern: compiled regular expression to search with
        :param text: string to search in
        :return: the matched text with surrounding whitespace removed, or
                 ``None`` when the pattern does not occur in ``text``
        """
        found = pattern.search(text)
        if found is None:
            return None
        return found.group().strip()

    def extract_other_result(self, text):
        """Extract every known field from ``text``.

        :param text: text to search
        :return: dict with the keys ``unitName``, ``unitChargePeople``,
                 ``financeChargePeople``, ``filledPeople`` and ``year``;
                 a value is ``None`` when its caption is absent and ``''``
                 when the caption is present but nothing follows it
        """
        field_patterns = {
            'unitName': self.unitName_pattern,
            'unitChargePeople': self.unitChargePeople_pattern,
            'financeChargePeople': self.financeChargePeople_pattern,
            'filledPeople': self.filledPeople_pattern,
            'year': self.year_pattern,
        }
        return {key: self.match(pattern, text)
                for key, pattern in field_patterns.items()}


class Extract:
    """Extract two narrative sections from report text.

    The result has the shape
    ``{'主要职能': <text or None>, '当年取得的主要事业成效': <text or None>}``.
    """

    def __init__(self):
        # ``[\s\S]*?`` is the lazy "any character, newlines included" idiom.
        # It avoids an alternation inside a repeated capture group, which
        # matches the same text but does more work per character.
        #
        # The "主要职能" section starts after a numbered heading such as
        # "1.主要职能" and runs up to the heading
        # "（X）当年取得的主要事业成效". Any sub-sections in between (for
        # example "机构情况" / "人员情况") are therefore part of this value.
        self.main_functions = re.compile(
            r'(?<=[0-9][\.．]主要职能[。\n])[\s\S]*?'
            r'(?=（[一二三四五六七八九十]）当年取得的主要事业成效[。\n])')
        # The "当年取得的主要事业成效" section starts after its own heading
        # and runs up to the heading "X、收入支出预算执行情况分析".
        self.business_results = re.compile(
            r'(?<=（[一二三四五六七八九十]）当年取得的主要事业成效[。\n])[\s\S]*?'
            r'(?=[一二三四五六七八九十]、收入支出预算执行情况分析)')

    @staticmethod
    def match(pattern, text):
        """Return the first match of ``pattern`` in ``text``, stripped.

        :param pattern: compiled regular expression to search with
        :param text: string to search in
        :return: the matched text with surrounding whitespace removed, or
                 ``None`` when the pattern does not occur in ``text``
        """
        found = pattern.search(text)
        if found is None:
            return None
        return found.group().strip()

    def extract_result(self, text):
        """Extract both sections from ``text``.

        :param text: full report text, with one paragraph per line
        :return: dict with the keys ``'主要职能'`` and
                 ``'当年取得的主要事业成效'``; a value is ``None`` when the
                 section's start or end heading is not found
        """
        return {
            '主要职能': self.match(self.main_functions, text),
            '当年取得的主要事业成效': self.match(self.business_results, text),
        }


def get_text_from_docx(filepath):
    """Return the text of every paragraph of a Word document as one string.

    Only ``document.paragraphs`` is read. Each paragraph becomes one line
    terminated by ``'\\n'``. A paragraph whose XML contains ``<w:numPr>``
    gets the literal prefix ``'1.'`` (always ``'1.'``, whatever its actual
    number).
    NOTE(review): presumably the prefix lets the numbered-heading regexes
    match auto-numbered headings, whose number is not part of
    ``paragraph.text`` -- confirm.

    :param filepath: path of the .docx file, passed to ``Document``
    :return: the concatenated paragraph lines
    """
    lines = []
    for para in Document(filepath).paragraphs:
        is_numbered = '<w:numPr>' in para._element.xml
        line = '1.' + para.text if is_numbered else para.text
        lines.append(line + '\n')
    return ''.join(lines)


def get_cover_content_from_docx(filepath):
    """Return the cover title text and the following field text of a document.

    The paragraphs are first flattened into a list that alternates paragraph
    text and ``'\\n'`` entries. A paragraph whose XML contains ``<w:numPr>``
    is prefixed with the literal ``'1.'``. Two slices of that list are then
    cleaned of blank entries and joined.

    NOTE(review): because the list holds two entries per paragraph, the slice
    ``[:14]`` covers the first 7 paragraphs and ``[14:35]`` covers the next 11
    (8th to 18th). The original comments spoke of 15 paragraphs; confirm
    which is intended before changing the bounds.

    :param filepath: path of the .docx file, passed to ``Document``
    :return: tuple ``(title_text, other_text)``; ``title_text`` is the
             non-blank entries of the first slice joined without separator,
             ``other_text`` is the non-blank entries of the second slice,
             each followed by ``'\\n'``
    """
    # Step 1: flatten the paragraphs into [text, '\n', text, '\n', ...].
    pieces = []
    for para in Document(filepath).paragraphs:
        prefix = '1.' if '<w:numPr>' in para._element.xml else ''
        pieces.extend((prefix + para.text, '\n'))

    def has_text(piece):
        # Blank means: nothing left after removing non-breaking spaces and
        # surrounding whitespace (this also drops the '\n' separator entries).
        return bool(piece.replace("\xa0", "").strip())

    # Step 2: cover title, taken from the leading entries.
    title_text = "".join(
        piece.strip() for piece in pieces[:14] if has_text(piece))

    # Step 3: remaining cover fields, one per line.
    other_text = ''.join(
        piece.strip() + '\n' for piece in pieces[14:35] if has_text(piece))

    return title_text, other_text


if __name__ == '__main__':
    # Manual run: read one sample report and print the extracted sections.
    sample_report = "data/2022年度安岳县元坝镇人民政府部门决算分析报告(1).docx"
    report_text = get_text_from_docx(sample_report)
    print(Extract().extract_result(report_text))
    # fifth_area_pattern = re.compile(r'(?<=[0-9][\.．]会计报表重要项目的明细信息及说明[。\n])(.|\n)*?(?=[0-9][\.．]需要说明的其他事项[。\n])')
    # filepath = "wKjIbGQeSb6AUq1aAAgAABcLaMw312.docx"
    # document = Document(filepath)
    # documents = get_text_from_docx(filepath)
    #
    # area_group = fifth_area_pattern.search(documents)
    # if area_group:
    #     area_text = area_group.group().strip("1.").strip()
    # else:
    #     area_text = ""
    #
    # print(area_text)
    # cover_contents, other_contents = get_cover_content_from_docx(filepath)
    # cover_pattern = re.compile(r"([0-9]{0,4}).*(?=(财务报告))")
    #
    # # print(content)
    # cover_group = cover_pattern.search(cover_contents)
    # if cover_group:
    #     cover_text = cover_group.group().strip()
    # else:
    #     cover_text = ""
    #
    # other_extract = Other_Extract()
    # other_data = other_extract.extract_other_result(other_contents)
    # other_data["reportTitle"] = cover_text
    # print(other_data)



    # extract = Extract()
    # # path = r'D:\四川报告\相关代码\四川报告之文本内容提取\data'
    # path = "data/temp.docx"
    # result = extract.extract_result(path)
    # print(result)
    # for file in os.listdir(path):
    #     if file[-4:] == 'docx':
    #         filepath = os.path.join(path, file)
    #         paras = get_text_from_docx(filepath)
    #         print(paras)
    #         result = extract.extract_result(paras)
    #         print(result)