Python常用功能函数系列总结(二)
2021-04-26 16:28
标签:词语 自定义 obj dataframe counter ini href desc rds

方式一:jieba分词+停用词+自定义词典 方式二:jieba分词+信息熵合并

经验分享:若有好的词典和停用词,优先选用方式一,否则选择方式二。
经验分享:注意输入格式为excel文件,这也是我学习生活中常用的处理方式,直接拿去用,非常方便。
另外,在我之前的一篇博客中,我介绍了Python统计词频常用的几种方式,不同的场景可以满足你各自的需求。博客传送门:https://www.cnblogs.com/zhangyafei/p/10653977.html
原文地址:https://www.cnblogs.com/zhangyafei/p/13251458.html

常用函数二:文本分词
# -*- coding: utf-8 -*-
"""
Datetime: 2020/06/25
Author: Zhang Yafei
Description: 文本分词
输入 停用词文件路径 词典文件路径 分词文件路径 表名(可选) 列名 分词结果列名 保存文件名
输出 分词结果-文件
"""
import os
import re
import jieba
import pandas as pd
# Ensure the output directory for segmentation results exists.
# makedirs(..., exist_ok=True) is race-free, unlike exists()+mkdir().
os.makedirs('res', exist_ok=True)
class TextCut(object):
    """Segment Chinese text into space-separated tokens with jieba.

    Optionally loads a jieba user dictionary and a stop-word list, then
    segments one column of an Excel sheet and writes the result to a new
    column of the same sheet.
    """

    # Matches every run of characters that is NOT a digit, ASCII letter or
    # CJK ideograph; compiled once instead of on every clean_txt() call.
    _CLEAN_RE = re.compile(r"[^0-9a-zA-Z\u4e00-\u9fa5]+")

    def __init__(self, dictionary=None, stopwords=None):
        """
        :param dictionary: path of a jieba user-dictionary file (optional)
        :param stopwords: path of a stop-word file, one word per line (optional)
        """
        self.dictionary = dictionary
        self.word_list = None  # filled by run(word_in_dict=True)
        if self.dictionary:
            jieba.load_userdict(self.dictionary)
        # Use a set for O(1) membership tests. Fall back to an empty set
        # (not None) so cut()/cut2() never do `x not in None` -> TypeError.
        if stopwords:
            with open(stopwords, 'r', encoding='utf-8') as swf:
                self.stopwords = {line.strip() for line in swf}
        else:
            self.stopwords = set()

    @staticmethod
    def clean_txt(raw):
        """Replace each run of non-alphanumeric, non-Chinese characters in
        *raw* with a single space and return the result."""
        return TextCut._CLEAN_RE.sub(' ', raw)

    def cut(self, text):
        """Segment *text*, dropping stop words, whitespace-only tokens and
        single-character tokens; return the tokens joined by spaces."""
        sentence = self.clean_txt(text.strip().replace('\n', ''))
        return ' '.join(tok for tok in jieba.cut(sentence)
                        if tok.strip() and tok not in self.stopwords and len(tok) > 1)

    def cut2(self, text):
        """Like cut(), but additionally keep only tokens that appear in the
        loaded user dictionary (self.word_list)."""
        sentence = self.clean_txt(text.strip().replace('\n', ''))
        return ' '.join(tok for tok in jieba.cut(sentence)
                        if tok.strip() and tok not in self.stopwords
                        and len(tok) > 1 and tok in self.word_list)

    def run(self, file_path, col_name, new_col_name, to_file, sheet_name=None, word_in_dict=False):
        """Read an Excel file, segment column *col_name*, save to *to_file*.

        :param file_path: input .xlsx path
        :param col_name: column holding the raw text
        :param new_col_name: column that receives the segmented text
        :param to_file: output .xlsx path
        :param sheet_name: sheet to read (first sheet when None)
        :param word_in_dict: if True keep only words found in the user dictionary
        """
        if sheet_name:
            df = pd.read_excel(file_path, sheet_name=sheet_name)
        else:
            df = pd.read_excel(file_path)
        if word_in_dict:
            # Set instead of list: cut2 tests membership per token.
            with open(self.dictionary, encoding='utf-8') as f:
                self.word_list = {word.strip() for word in f}
            df[new_col_name] = df[col_name].apply(self.cut2)
        else:
            df[new_col_name] = df[col_name].apply(self.cut)
        df.to_excel(to_file, index=False)
        print('######### 处理完成 ############')
if __name__ == "__main__":
# 1. 分词
text_cut = TextCut(stopwords=‘data/stopwords.txt‘, dictionary=‘data/word_dict.txt‘)
text_cut.run(file_path=‘data/山西政策.xlsx‘, sheet_name=‘1.21-2.20‘, col_name=‘全文‘, new_col_name=‘全文分词‘,
to_file=‘res/山西政策_分词.xlsx‘)
# -*- coding: utf-8 -*-
"""
Datetime: 2020/03/01
Author: Zhang Yafei
Description: 基于信息熵对分词结果进行合并
"""
from collections import Counter
from functools import reduce
from pandas import read_excel, DataFrame
class InfoEntropyMerge(object):
    """Merge adjacent segmented words into compound words using a
    pair-frequency / word-frequency ratio (called "information entropy"
    in the original post)."""

    def __init__(self, data, stopwords='data/stopwords.txt'):
        """
        :param data: iterable of token lists, one list per document
        :param stopwords: stop-word file path; pass None/'' to skip filtering
        """
        self.data = data
        self.words_freq_one = {}      # single-word frequencies (Counter after count_word_freq_one)
        self.words_freq_two = {}      # (word1, word2) -> adjacent-pair frequency
        self.entropy_words_dict = {}  # merged compound -> score
        if stopwords:
            with open(stopwords, 'r', encoding='utf-8') as f:
                self.stopwords = {line.strip() for line in f}
        else:
            self.stopwords = None

    def count_word_freq_one(self, save_to_file=False, word_freq_file=None):
        """Count single-word frequencies; optionally dump them to Excel."""
        keywords = (word for word_list in self.data for word in word_list if word)
        self.words_freq_one = Counter(keywords)
        if save_to_file:
            words = list(self.words_freq_one)
            freqs = [self.words_freq_one[word] for word in words]
            words_df = DataFrame(data={'word': words, 'freq': freqs})
            words_df.sort_values('freq', ascending=False, inplace=True)
            words_df.to_excel(word_freq_file, index=False)

    def count_freq(self, word1, word2):
        """reduce() step: count the adjacent pair (word1, word2).

        Returns word2 so it becomes the left word of the next pair.
        """
        if (word1, word2) not in self.words_freq_two:
            self.words_freq_two[(word1, word2)] = 1
        else:
            self.words_freq_two[(word1, word2)] += 1
        return word2

    def count_word_freq_two(self, save_to_file=False, word_freq_file=None):
        """Count adjacent-pair frequencies over all documents; optionally
        dump them to Excel."""
        for word_list in self.data:
            reduce(self.count_freq, word_list)
        if save_to_file and word_freq_file:
            pairs = list(self.words_freq_two)
            freqs = [self.words_freq_two[pair] for pair in pairs]
            words_df = DataFrame(data={'word': pairs, 'freq': freqs})
            words_df.sort_values('freq', ascending=False, inplace=True)
            words_df.to_excel(word_freq_file, index=False)

    @staticmethod
    def is_chinese(word):
        """Return True iff every character of *word* is a CJK ideograph
        (U+4E00..U+9FA5, matching the range used by the segmentation step)."""
        for ch in word:
            if not ('\u4e00' <= ch <= '\u9fa5'):
                return False
        return True

    def clac_entropy(self, save_to_file=False, dict_path=None):
        """Score each adjacent pair and keep those scoring above 0.5 in
        self.entropy_words_dict.

        NOTE(review): this method's body was garbled in the scraped source;
        the score is reconstructed as pair_freq / min(freq(w1), freq(w2)) —
        confirm against the original blog post before relying on exact
        values. When stop words are loaded, both members must be pure
        Chinese and non-stop-words.

        :param save_to_file: prepend merged words to the dictionary file
        :param dict_path: user-dictionary file to update (read+write)
        """
        for (word1, word2), freq_two in self.words_freq_two.items():
            freq_one_min = min(self.words_freq_one[word1], self.words_freq_one[word2])
            w1_w2_entropy = freq_two / freq_one_min
            if self.stopwords:
                if (w1_w2_entropy > 0.5
                        and word1 not in self.stopwords and word2 not in self.stopwords
                        and self.is_chinese(word1) and self.is_chinese(word2)):
                    self.entropy_words_dict[word1 + word2] = w1_w2_entropy
            else:
                if w1_w2_entropy > 0.5:
                    self.entropy_words_dict[word1 + word2] = w1_w2_entropy
        print('信息熵大于0.5的词语组合:\n', self.entropy_words_dict)
        if save_to_file and dict_path:
            # Prepend the new compounds so jieba's user dictionary sees them.
            with open(dict_path, mode='r+', encoding='utf-8') as f:
                content = f.read()
                f.seek(0, 0)
                for word in self.entropy_words_dict:
                    f.write(word + '\n')
                f.write(content)
            print(f'成功将信息熵大于0.5的词语保存到了{dict_path}中')
def data_read(path, col_name):
    """Read an Excel file and return the non-null cells of *col_name*,
    each split on whitespace into a list of tokens.

    :param path: .xlsx file produced by the segmentation step
    :param col_name: column containing space-separated tokens
    :return: pandas Series of token lists
    """
    df = read_excel(path)
    texts = df.loc[df[col_name].notna(), col_name].str.split()
    return texts
if __name__ == '__main__':
    # Merge adjacent words of the segmented national-policy file by score.
    text_list = data_read(path='res/国家政策_分词.xlsx', col_name='全文分词')
    info_entro = InfoEntropyMerge(data=text_list)
    info_entro.count_word_freq_one()
    info_entro.count_word_freq_two()
    info_entro.clac_entropy(save_to_file=False, dict_path='data/entropy_dict.txt')
常用函数三:词频统计
# -*- coding: utf-8 -*-
"""
Datetime: 2020/06/25
Author: Zhang Yafei
Description: 统计词频
输入 文件名 列名 分割符
输出 词频统计结果-文件
"""
from collections import Counter
import pandas as pd
def count_word_freq(file_path, col_name, to_file, sep='; ', multi_table=False):
    """Count word frequencies of a delimited text column in an Excel file.

    :param file_path: input .xlsx path
    :param col_name: name of the column whose cells hold sep-joined words
    :param to_file: output .xlsx path (columns: word, freq, sorted desc)
    :param sep: delimiter between words inside a cell
    :param multi_table: when True, process every sheet of the workbook and
                        write one result sheet per input sheet
    """
    if multi_table:
        # NOTE: the original passed header=None here, which makes columns
        # integer-labelled and breaks the df[col_name] lookup by name.
        sheets = pd.read_excel(file_path, sheet_name=None)
        with pd.ExcelWriter(path=to_file) as writer:
            for sheet_name, df in sheets.items():
                _freq_frame(df, col_name, sep).to_excel(
                    excel_writer=writer, sheet_name=sheet_name, index=False)
        # ExcelWriter saves on context exit; the original's explicit
        # writer.save() was redundant and is removed in pandas 2.0.
    else:
        df = pd.read_excel(file_path)
        # The original split with .str.split() here, silently ignoring *sep*;
        # use the same delimiter as the multi-sheet branch.
        _freq_frame(df, col_name, sep).to_excel(to_file, index=False)


def _freq_frame(df, col_name, sep):
    """Return a DataFrame(word, freq) for one sheet, sorted by freq desc."""
    keywords = (word
                for word_list in df.loc[df[col_name].notna(), col_name].str.split(sep)
                for word in word_list if word)
    words_freq = Counter(keywords)
    words = list(words_freq)
    freqs = [words_freq[word] for word in words]
    words_df = pd.DataFrame(data={'word': words, 'freq': freqs})
    words_df.sort_values('freq', ascending=False, inplace=True)
    return words_df
if __name__ == '__main__':
    # Count frequencies of the 'keyword' column on every sheet of data.xlsx,
    # splitting cells on the default '; ' separator; save results to res.xlsx.
    count_word_freq(file_path='data.xlsx', col_name='keyword', to_file='res.xlsx', multi_table=True)
上一篇:数组的动态和
下一篇:Python学习的第一次总结
文章标题:Python常用功能函数系列总结(二)
文章链接:http://soscw.com/index.php/essay/79851.html