Excel文件数据抽取

本文仅仅是对Excel中的数据进行了简单的读取抽出并统一输出到指定的地方,在读取老版本xls文件的时候可能会出现编码的问题导致文件读取失败:

1
UnicodeDecodeError: 'utf-16-le' codec can't decode byte 0x40 in position 104: truncated data

可以通过对报错的源码unpack_unicode代码进行try-except并将其赋值为空。

1
2
3
4
try: 
strg = unpack_unicode(data, 0, lenlen=2)
except:
strg = ""

以下代码仅仅是遍历指定目录下的所有文件,之所以多写一重判断循环文件,是为了将第一层的文件夹名称生成一个新的sheet,第一级文件夹下的所有文件的数据都统计到当前文件夹名称所对应的sheet中,且对实际情况进行了简单的数据过滤和处理,在此仅仅是存储一个简单的模板,后续有其他的需求可在该模板上持续修改优化。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
# -*- coding: UTF-8 -*-
import os

import openpyxl
import pandas as pd
import xlrd


def find_value(basePath):
fileDirs = os.listdir(basePath)
dataAllValue = pd.DataFrame()
for fileDir in fileDirs:
childFile = os.path.join('%s%s%s' % (basePath, "\\", fileDir))
if os.path.isfile(childFile):
if fileDir.endswith("xls") or fileDir.endswith("xlsx") or fileDir.endswith("XLS") or fileDir.endswith("XLSX"):
colunm_count, sheetDataFrameValue = read_excel(childFile)
if sheetDataFrameValue.empty:
continue
if colunm_count == 1:
continue
dataAllValue = dataAllValue.append(sheetDataFrameValue)
if os.path.isdir(childFile):
childDataFrameValue = find_value(childFile)
dataAllValue = dataAllValue.append(childDataFrameValue)

return dataAllValue


def read_excel(childFile):
try:
FileObj = xlrd.open_workbook(childFile) # 打开处理的excel文件
sheetNames = FileObj.sheet_names()
except:
print("有问题的文件:", childFile)
return 1, pd.DataFrame()
i = 0
for sheetName in sheetNames:
sheet = FileObj.sheets()[i] # 获取第一个工作表
i += 1
row_count = sheet.nrows # 行数
colunm_count = 0
sheetDataFrameValue = pd.DataFrame()
for element in range(2, row_count):
row_values = [cell_value_clear(cell_value) for cell_value in sheet.row_values(element)]
for keyValue in keyValues:
if keyValue.lower() in row_values:
findCon = sheet.row_values(element)
not_empty = [i for i in findCon if i != '']
if len(not_empty) < 3:
continue

columnDf = pd.DataFrame(findCon)
columnDf.rename(columns={0: colunm_count}, inplace=True)
if sheetDataFrameValue.empty:
sheetDataFrameValue = columnDf.join(sheetDataFrameValue)
else:
sheetDataFrameValue = sheetDataFrameValue.join(columnDf)
colunm_count += 1
return colunm_count, sheetDataFrameValue

def cell_value_clear(cell_value):
if cell_value is None:
return ''
return str(cell_value).replace(" ", "").lower()

def write_excel_xlsx(dataFrameValue1, sheetName, excelName):
newExcel = pd.DataFrame(dataFrameValue1)
writer = pd.ExcelWriter(excelName, engine='openpyxl')
is_file_exists = os.path.exists(excelName) # 判断文件是否存在
if is_file_exists is True:
book = openpyxl.load_workbook(writer.path)
writer.book = book
newExcel.to_excel(excel_writer=writer, sheet_name=sheetName, encoding="utf-8", index=False)
writer.save()
writer.close()
else:
newExcel.to_excel(excel_writer=writer, sheet_name=sheetName, encoding="utf-8", index=False)
writer.save()
writer.close()

def run(basePath):
pathDirs = os.listdir(basePath)
for path in pathDirs:
provinceDataPath = os.path.join('%s%s%s' % (basePath, "\\", path))
print(provinceDataPath)
if os.path.isdir(provinceDataPath):
dataAllValue = find_value(provinceDataPath)
print(dataAllValue)
if dataAllValue.empty:
continue
write_excel_xlsx(dataAllValue, path, excelName)

if __name__ == '__main__':
# 文件路径
basePath = r"F:\data"
keyValues = ["指标1", "指标2", "指标3", "指标4"]
excelName = 'F:\抽取数据\数据统计总览.xlsx' # 文件保存的路径
run(basePath)