# -*- coding: utf-8 -*-
# @Time : 2023/2/17 11:57
# @Author : ctt
# @File : extract_table
# @Project : 从word中提取指定表格
import re
import json
import pandas as pd
from docx import Document
from docx.document import Document as _Document
from docx.oxml.text.paragraph import CT_P
from docx.oxml.table import CT_Tbl
from docx.table import _Cell, Table, _Row
from docx.text.paragraph import Paragraph


def iter_block_items(parent):
    """
    Yield the paragraphs and tables that are direct children of *parent*,
    in the order they appear.

    *parent* may be a document, a table cell or a table row; the matching
    underlying XML container is walked and every paragraph element is wrapped
    in a Paragraph and every table element in a Table. Children of any other
    element type are skipped.

    :param parent: a _Document, _Cell or _Row instance
    :return: generator of Paragraph / Table objects bound to *parent*
    :raises ValueError: if *parent* is none of the supported types (raised
        when the generator is first advanced)
    """
    # Supported parent types paired with the accessor for their XML container.
    # The order mirrors the precedence of the type checks.
    container_lookup = (
        (_Document, lambda owner: owner.element.body),
        (_Cell, lambda owner: owner._tc),
        (_Row, lambda owner: owner._tr),
    )
    for owner_type, locate in container_lookup:
        if isinstance(parent, owner_type):
            container = locate(parent)
            break
    else:
        raise ValueError("something's not right")

    # NOTE(review): a row's direct XML children are presumably cells rather
    # than paragraphs/tables, so a _Row parent likely yields nothing - confirm.
    for node in container.iterchildren():
        if isinstance(node, CT_P):
            yield Paragraph(node, parent)
            continue
        if isinstance(node, CT_Tbl):
            yield Table(node, parent)


def parase_table(table):
    """
    Convert a table into a DataFrame holding the stripped text of every cell.

    Each row of *table* becomes one DataFrame row and each cell one column;
    both axes are labelled 0, 1, 2, ... so callers can address rows and
    columns by position-like integer labels. Rows with fewer cells than the
    widest row are padded with missing values.

    :param table: object exposing ``rows``; every row exposes ``cells`` and
        every cell a ``text`` string (e.g. a docx Table)
    :return: pandas.DataFrame of cell texts (empty if the table has no rows)
    """
    # Collect all rows first and build the frame in one go: growing a
    # DataFrame with pd.concat inside the loop re-copies the accumulated data
    # on every iteration and needs a transpose at the end.
    rows = [[cell.text.strip() for cell in row.cells] for row in table.rows]
    return pd.DataFrame(rows)


def get_choose_table(document, table_names: list):
    """
    Extract the tables captioned with one of *table_names*, including
    continuation tables, and return them as JSON-style records.

    The blocks of *document* are visited in order:

    * A paragraph updates the list of table names found in its text.
    * A table met while that list is non-empty is stored under the first
      matched name. Its first row is treated as a header line from which the
      keys '时间', '单位' and '编制单位' are filled once; its second row becomes
      the column names; the remaining rows are the data.
    * A table met while the list is empty is treated as a continuation when
      its top-left cell equals one of *table_names*: the first two rows are
      dropped, the third becomes the column names and the rest is appended to
      the data already stored under that name.

    Note that the match list only changes at the next paragraph, so several
    consecutive tables after one matching paragraph overwrite each other.

    :param document: docx document accepted by ``iter_block_items``
    :param table_names: literal table captions to look for; they are also the
        keys of the result
    :return: dict mapping every name to a list of row dicts ('' when the table
        was not found), plus the '时间' / '单位' / '编制单位' entries when a
        captioned table was parsed
    :raises IndexError: if a header cell contains no colon or a table has
        fewer rows/columns than this layout expects
    :raises KeyError: if a continuation table has fewer than two rows
    """
    table_names_data = dict.fromkeys(table_names, '')
    if not table_names:
        # An empty alternation would match every paragraph; nothing to look for.
        return table_names_data
    # The names are literal captions (they double as result keys), so escape
    # them: unescaped metacharacters such as "(1)" would change what matches
    # and make findall() return group contents instead of the caption.
    dw_pattern = re.compile('|'.join(re.escape(name) for name in table_names))
    # Names matched by the most recent paragraph. Start empty so a document
    # that begins with a table does not reference an unbound variable.
    dw = []
    for block in iter_block_items(document):
        if isinstance(block, Paragraph):
            dw = dw_pattern.findall(block.text)
        elif isinstance(block, Table) and dw:
            # Table located through its caption paragraph.
            table_df = parase_table(block.table)
            if "编制单位" not in table_names_data:
                # Header line, e.g. "编制单位：xxx  2021年12月31日  单位：万元";
                # half-width colons are normalised before splitting.
                table_names_data['时间'] = table_df.iloc[0, -2]
                table_names_data['单位'] = table_df.iloc[0, -1].replace(":", "：").split("：")[1]
                table_names_data['编制单位'] = table_df.iloc[0, 0].replace(":", "：").split("：")[1]
            # Drop the header line, promote the next row to column names and
            # remove it from the data so it is not duplicated.
            table_df.drop([0], inplace=True)
            table_df.rename(columns=table_df.iloc[0], inplace=True)
            table_df.drop(table_df.index[0], inplace=True)
            table_names_data[dw[0]] = table_df
        elif isinstance(block, Table):
            # Possible continuation table: its own first cell carries the name.
            table_df = parase_table(block.table)
            if table_df.empty:
                continue
            update_table_key = table_df.iloc[0, 0]
            if update_table_key in table_names:
                # Remove the two leading rows, then promote the next row to
                # column names and drop it from the data.
                table_df.drop([0, 1], inplace=True)
                table_df.rename(columns=table_df.iloc[0], inplace=True)
                table_df.drop(table_df.index[0], inplace=True)
                previous = table_names_data[update_table_key]
                if isinstance(previous, pd.DataFrame):
                    table_df = pd.concat([previous, table_df], ignore_index=True)
                # When the main table was never found the stored value is still
                # the '' placeholder, which cannot be concatenated; keep the
                # continuation on its own in that case.
                table_names_data[update_table_key] = table_df
    # Convert every collected DataFrame to a list of row dicts. Iterate over a
    # snapshot because the dict is updated inside the loop.
    for table_key, table_value in list(table_names_data.items()):
        if isinstance(table_value, pd.DataFrame):
            table_names_data[table_key] = json.loads(
                table_value.to_json(orient='records', force_ascii=False))
    return table_names_data


def get_other_table(document, table_names: list):
    """
    Extract two columns of the tables captioned with one of *table_names*.

    The blocks of *document* are visited in order. A paragraph updates the
    list of names found in its text; a table met while that list is non-empty
    is parsed, its first two rows are dropped, only the first and fourth
    columns are kept, the next row becomes the column names and the remaining
    rows are stored under the first matched name. The match list only changes
    at the next paragraph, so consecutive tables overwrite each other.

    :param document: docx document accepted by ``iter_block_items``
    :param table_names: literal captions to look for; also the result keys
    :return: dict mapping every name to a list of row dicts, or '' when no
        table followed a matching paragraph
    :raises KeyError: if a matched table has fewer than two rows
    :raises IndexError: if a matched table has fewer than four columns or no
        row left to use as column names
    """
    table_names_data = dict.fromkeys(table_names, '')
    if not table_names:
        # An empty alternation would match every paragraph; nothing to look for.
        return table_names_data
    # Escape the names so they are matched literally and findall() returns
    # the caption itself rather than regex group contents.
    dw_pattern = re.compile('|'.join(re.escape(name) for name in table_names))
    # Names matched by the most recent paragraph. Start empty so a document
    # that begins with a table does not reference an unbound variable.
    dw = []
    for block in iter_block_items(document):
        if isinstance(block, Paragraph):
            dw = dw_pattern.findall(block.text)
        elif isinstance(block, Table) and dw:
            table_df = parase_table(block.table)
            table_df.drop([0, 1], inplace=True)
            # Work on a copy of the selected columns; modifying the slice
            # directly would trigger a pandas warning.
            select_df = table_df.iloc[:, [0, 3]].copy()
            # Promote the first remaining row to column names, then remove it
            # from the data.
            select_df.rename(columns=select_df.iloc[0], inplace=True)
            select_df.drop(select_df.index[0], inplace=True)
            table_names_data[dw[0]] = select_df
    # Convert every collected DataFrame to a list of row dicts. Iterate over a
    # snapshot because the dict is updated inside the loop.
    for table_key, table_value in list(table_names_data.items()):
        if isinstance(table_value, pd.DataFrame):
            table_names_data[table_key] = json.loads(
                table_value.to_json(orient='records', force_ascii=False))
    return table_names_data


def get_other1_table(document, table_names: list):
    """
    Extract the tables captioned with one of *table_names*, skipping three
    leading rows.

    The blocks of *document* are visited in order. A paragraph updates the
    list of names found in its text; a table met while that list is non-empty
    is parsed, its first three rows are dropped, the next row becomes the
    column names and the remaining rows (all columns) are stored under the
    first matched name. The match list only changes at the next paragraph, so
    consecutive tables overwrite each other.

    :param document: docx document accepted by ``iter_block_items``
    :param table_names: literal captions to look for; also the result keys
    :return: dict mapping every name to a list of row dicts, or '' when no
        table followed a matching paragraph
    :raises KeyError: if a matched table has fewer than three rows
    :raises IndexError: if no row is left to use as column names
    """
    table_names_data = dict.fromkeys(table_names, '')
    if not table_names:
        # An empty alternation would match every paragraph; nothing to look for.
        return table_names_data
    # Escape the names so they are matched literally and findall() returns
    # the caption itself rather than regex group contents.
    dw_pattern = re.compile('|'.join(re.escape(name) for name in table_names))
    # Names matched by the most recent paragraph. Start empty so a document
    # that begins with a table does not reference an unbound variable.
    dw = []
    for block in iter_block_items(document):
        if isinstance(block, Paragraph):
            dw = dw_pattern.findall(block.text)
        elif isinstance(block, Table) and dw:
            table_df = parase_table(block.table)
            table_df.drop([0, 1, 2], inplace=True)
            # Keep every column; copy so the parsed frame is left untouched.
            select_df = table_df.copy()
            # Promote the first remaining row to column names, then remove it
            # from the data.
            select_df.rename(columns=select_df.iloc[0], inplace=True)
            select_df.drop(select_df.index[0], inplace=True)
            table_names_data[dw[0]] = select_df
    # Convert every collected DataFrame to a list of row dicts. Iterate over a
    # snapshot because the dict is updated inside the loop.
    for table_key, table_value in list(table_names_data.items()):
        if isinstance(table_value, pd.DataFrame):
            table_names_data[table_key] = json.loads(
                table_value.to_json(orient='records', force_ascii=False))
    return table_names_data


if __name__ == '__main__':
    # Manual smoke run: load the sample report and print the table that
    # follows the given caption.
    sample_path = r'data/3月23测试半成品.docx'
    wanted_captions = ['货币资金明细信息如下']
    extracted = get_other1_table(Document(sample_path), wanted_captions)
    print(extracted)

    # import datetime
    # start_time = datetime.datetime.now()
    # docx_file = r'data/四川报告模板.docx'
    # document = Document(docx_file)
    # data = get_choose_table(document, ['资产负债表', '收入费用表（1）', '收入费用表（2）'])
    # # 处理资产负债表
    # temp_list = data["资产负债表"]
    # temp_dict = {}
    #
    # for temp in temp_list:
    #     temp_text = re.sub(":", "：", temp["项目"])
    #     if temp_text.endswith("："):
    #         temp_dict.update({"temp_key": temp_text})
    #         continue
    #     else:
    #         temp["上级项目"] = temp_dict["temp_key"].strip("：")
    #
    #
    # # 处理收入费用表（1）
    # temp_list_0 = data["收入费用表（1）"]
    # temp_dict_0 = {"temp_key": "收入合计"}
    # # updata_list = ["收入合计", "本年盈余"]
    # for temp_0 in temp_list_0:
    #     if temp_0["项目"].strip() == "收入合计":
    #         temp_dict_0.update({"temp_key": "本年盈余"})
    #     else:
    #         if temp_0["项目"].strip() == "本年盈余":
    #             continue
    #         else:
    #             temp_0["上级项目"] = temp_dict_0["temp_key"]
    #
    # # 处理收入费用表（2）
    # temp_list_1 = data["收入费用表（2）"]
    # temp_dict_1 = {"temp_key": "收入合计"}
    # # updata_list = ["收入合计", "本年盈余"]
    # for temp_1 in temp_list_1:
    #     if temp_1["项目"].strip() == "收入合计":
    #         temp_dict_1.update({"temp_key": "本年盈余"})
    #     else:
    #         if temp_1["项目"].strip() == "本年盈余":
    #             continue
    #         else:
    #             temp_1["上级项目"] = temp_dict_1["temp_key"]
    # print(data)
    # end_time = datetime.datetime.now()
    # print(start_time)
    # print(end_time)
    # print("耗时: {}秒".format(end_time - start_time))





