
先安装依赖
pip install python-docx
段落内容和表格内容分开读取
import docx
from docx.document import Document
from docx.table import Table
from docx.text.paragraph import Paragraph
def read_word_document(file_path: str) -> None:
    """Print the paragraphs and the tables of a Word document, group by group.

    All non-empty paragraphs are printed first, numbered by their position
    among *all* paragraphs (blank ones are counted but not printed). After
    that every table is printed row by row, each row as a list of cell texts.

    Args:
        file_path: Path of the ``.docx`` file to read.

    Returns:
        None. Results are written to stdout. Failures are reported on stdout
        as well instead of being raised to the caller.
    """
    try:
        # Use the caller-supplied path as-is; substituting a fixed file name
        # here would make the argument meaningless.
        doc: Document = docx.Document(file_path)

        # Paragraphs first.
        print("文档段落内容:")
        for i, paragraph in enumerate(doc.paragraphs, 1):
            if paragraph.text.strip():  # skip blank paragraphs
                print(f"段落 {i}: {paragraph.text}")

        # Then the tables, numbered from 1 like their rows.
        print("\n文档表格内容:")
        for table_idx, table in enumerate(doc.tables, 1):
            print(f"\n表格 {table_idx}:")
            for row_idx, row in enumerate(table.rows, 1):
                row_data = [cell.text for cell in row.cells]
                print(f"行 {row_idx}: {row_data}")
    except FileNotFoundError:
        print(f"错误:找不到文件 '{file_path}'")
    except Exception as e:
        # Top-level boundary of a small script: report and carry on.
        print(f"错误:读取文件时发生异常 - {e}")
if __name__ == "__main__":
    # Replace the literal below with the path of a real Word document.
    read_word_document("example.docx")
按文档中的原始顺序,一次性读取段落和表格
import docx
from docx.document import Document
from docx.oxml.ns import qn
def read_word_document(file_path: str) -> None:
    """Print the paragraphs and tables of a Word document in document order.

    The children of the document body are walked once. Each non-empty
    paragraph becomes a ``[段落] ...`` line; each table becomes a ``[表格]``
    header followed by one line per row with the cell texts joined by
    ``" | "``. Other body children (for example section properties) are
    skipped.

    Args:
        file_path: Path of the ``.docx`` file to read.

    Returns:
        None. Results are written to stdout. Failures are reported on stdout
        as well instead of being raised to the caller.
    """
    # Import the proxy classes explicitly. A bare ``import docx`` does not
    # guarantee that the ``docx.table`` / ``docx.text.paragraph`` submodules
    # are reachable as attributes of the package, which surfaces as
    # "module 'docx' has no attribute 'table'".
    from docx.table import Table
    from docx.text.paragraph import Paragraph

    # Compare against the fully qualified tag names instead of a suffix so
    # that only real <w:p> and <w:tbl> elements are matched.
    paragraph_tag = qn("w:p")
    table_tag = qn("w:tbl")

    try:
        doc: Document = docx.Document(file_path)
        content = []

        for element in doc.element.body:
            if element.tag == paragraph_tag:
                paragraph = Paragraph(element, doc)
                if paragraph.text.strip():  # skip blank paragraphs
                    content.append(f"[段落] {paragraph.text}")
            elif element.tag == table_tag:
                table = Table(element, doc)
                table_content = [
                    " | ".join(cell.text for cell in row.cells)
                    for row in table.rows
                ]
                content.append("[表格]\n" + "\n".join(table_content))

        print("\n".join(content))
    except FileNotFoundError:
        print(f"错误:找不到文件 '{file_path}'")
    except Exception as e:
        # Top-level boundary of a small script: report and carry on.
        print(f"错误:读取文件时发生异常 - {e}")
if __name__ == "__main__":
    # The document location is given relative to the working directory.
    read_word_document("words/222.docx")
读取网络中的word内容
import docx
import requests
from io import BytesIO
def read_word_document_from_url(url: str, timeout: float = 10.0) -> None:
    """Download a Word document and print its content in document order.

    The response body is parsed in memory. Each non-empty paragraph becomes
    a ``[段落] ...`` line; each table becomes a ``[表格]`` header followed by
    one line per row with the cell texts joined by ``" | "``.

    Args:
        url: Address of the ``.docx`` file to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may wait indefinitely on a stalled server.

    Returns:
        None. Results are written to stdout. HTTP and parsing failures are
        reported on stdout as well instead of being raised to the caller.
    """
    # Import the helpers explicitly. A bare ``import docx`` does not
    # guarantee that the ``docx.table`` / ``docx.text.paragraph`` submodules
    # are reachable as attributes of the package, which surfaces as
    # "module 'docx' has no attribute 'table'".
    from docx.oxml.ns import qn
    from docx.table import Table
    from docx.text.paragraph import Paragraph

    # Compare against the fully qualified tag names instead of a suffix so
    # that only real <w:p> and <w:tbl> elements are matched.
    paragraph_tag = qn("w:p")
    table_tag = qn("w:tbl")

    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()  # turn 4xx/5xx answers into exceptions

        # python-docx accepts a file-like object, so no temp file is needed.
        doc = docx.Document(BytesIO(response.content))
        content = []

        for element in doc.element.body:
            if element.tag == paragraph_tag:
                paragraph = Paragraph(element, doc)
                if paragraph.text.strip():  # skip blank paragraphs
                    content.append(f"[段落] {paragraph.text}")
            elif element.tag == table_tag:
                table = Table(element, doc)
                table_content = [
                    " | ".join(cell.text for cell in row.cells)
                    for row in table.rows
                ]
                content.append("[表格]\n" + "\n".join(table_content))

        print("\n".join(content))
    except requests.exceptions.RequestException as e:
        # Covers connection errors, timeouts and bad HTTP status codes.
        print(f"HTTP 请求错误: {e}")
    except Exception as e:
        # Top-level boundary of a small script: report and carry on.
        print(f"错误: 处理文档时发生异常 - {e}")
if __name__ == "__main__":
    # Address of the remote Word document to print.
    read_word_document_from_url(
        "http://watertapcollection.cqzuxia.com/ImportTemplate/222.docx"
    )
读取 Word 报错:错误:读取文件时发生异常 - module 'docx' has no attribute 'table'(原因:只写 import docx 并不保证 docx.table 子模块已被加载,需要显式导入所用的类或元素类型)
直接读取文件的版本
import docx
from docx.oxml.table import CT_Tbl
from docx.oxml.text.paragraph import CT_P
from docx.text.paragraph import Paragraph
import argparse
def read_word_document(file_path: str) -> None:
    """Print the paragraphs and tables of a Word document in document order.

    Body children are classified by their element class (``CT_P`` for
    paragraphs, ``CT_Tbl`` for tables). Non-empty paragraphs become
    ``[段落] ...`` lines. A table becomes a ``[表格]`` header followed by one
    line per row; the text of all paragraphs inside a cell is concatenated
    without a separator and the cells of a row are joined by ``" | "``.

    Args:
        file_path: Path of the ``.docx`` file to read.

    Returns:
        None. Results are written to stdout. Failures are reported on stdout
        as well instead of being raised to the caller.
    """
    try:
        document = docx.Document(file_path)
        blocks = []

        for child in document.element.body:
            if isinstance(child, CT_P):
                text = Paragraph(child, document).text
                if text.strip():  # skip blank paragraphs
                    blocks.append(f"[段落] {text}")
                continue

            if not isinstance(child, CT_Tbl):
                # Neither a paragraph nor a table: nothing to print.
                continue

            row_lines = []
            for tr in child.tr_lst:
                cell_texts = [
                    "".join(Paragraph(p, document).text for p in tc.p_lst)
                    for tc in tr.tc_lst
                ]
                row_lines.append(" | ".join(cell_texts))
            blocks.append("[表格]\n" + "\n".join(row_lines))

        print("\n".join(blocks))
    except FileNotFoundError:
        print(f"错误:找不到文件 '{file_path}'")
    except Exception as e:
        # Top-level boundary of a small script: report and carry on.
        print(f"错误:处理文档时发生异常 - {e}")
if __name__ == "__main__":
    # The document path comes from the command line and falls back to a
    # default location when --file is not given.
    cli = argparse.ArgumentParser(description='读取Word文档内容')
    cli.add_argument(
        '--file',
        default='words/output.docx',
        help='Word文档路径(默认:words/output.docx)',
    )
    read_word_document(cli.parse_args().file)
读取网络地址(URL)的版本
import docx
import requests
from io import BytesIO
def read_word_document_from_url(url: str, timeout: float = 10.0) -> None:
    """Download a Word document and print its content in document order.

    The response body is parsed in memory. Body children are classified by
    their element class (``CT_P`` for paragraphs, ``CT_Tbl`` for tables).
    Non-empty paragraphs become ``[段落] ...`` lines. A table becomes a
    ``[表格]`` header followed by one line per row; the text of all
    paragraphs inside a cell is concatenated and the cells of a row are
    joined by ``" | "``.

    Args:
        url: Address of the ``.docx`` file to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may wait indefinitely on a stalled server.

    Returns:
        None. Results are written to stdout. HTTP and parsing failures are
        reported on stdout as well instead of being raised to the caller.
    """
    # Import the classes explicitly rather than reaching them through
    # attribute access on the ``docx`` package, which depends on which
    # submodules happen to be loaded already.
    from docx.oxml.table import CT_Tbl
    from docx.oxml.text.paragraph import CT_P
    from docx.text.paragraph import Paragraph

    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()  # turn 4xx/5xx answers into exceptions

        # python-docx accepts a file-like object, so no temp file is needed.
        doc = docx.Document(BytesIO(response.content))
        content = []

        for element in doc.element.body:
            if isinstance(element, CT_P):
                paragraph = Paragraph(element, doc)
                if paragraph.text.strip():  # skip blank paragraphs
                    content.append(f"[段落] {paragraph.text}")
            elif isinstance(element, CT_Tbl):
                # The element is used directly: it already is the table
                # element, so it must not be passed to the CT_Tbl
                # constructor, which builds a new element instead.
                table_content = []
                for row in element.tr_lst:
                    # Read every paragraph of the cell, not only the first
                    # one, so multi-paragraph cells are not truncated.
                    row_data = [
                        "".join(Paragraph(p, doc).text for p in cell.p_lst)
                        for cell in row.tc_lst
                    ]
                    table_content.append(" | ".join(row_data))
                content.append("[表格]\n" + "\n".join(table_content))

        print("\n".join(content))
    except requests.exceptions.RequestException as e:
        # Covers connection errors, timeouts and bad HTTP status codes.
        print(f"HTTP 请求错误: {e}")
    except Exception as e:
        # Top-level boundary of a small script: report and carry on.
        print(f"错误: 处理文档时发生异常 - {e}")
if __name__ == "__main__":
    # Address of the remote Word document to print.
    # NOTE: python-docx reads the .docx format only. A legacy binary .doc
    # file (e.g. ".../ImportTemplate/111.doc") must be converted to .docx
    # before it can be read here.
    url = "http://watertapcollection.xj.com/ImportTemplate/222.docx"
    read_word_document_from_url(url)
欢迎加群讨论技术,1群:677373950(满了,可以加,但通过不了),2群:656732739。有需要软件开发,或者学习软件技术的朋友可以和我联系~(Q:815170684)
评价
排名
8
文章
243
粉丝
7
评论
7
ICP备案 :渝ICP备18016597号-1
网站信息:2018-2025TNBLOG.NET
技术交流:群号656732739
联系我们:contact@tnblog.net
公网安备:
50010702506256


欢迎加群交流技术