Jieba分词 - 乐耶园

Jieba

jieba 是一个中文分词第三方库，被称为最好的 Python 中文分词库。支持三种分词模式：精确模式、全模式和搜索引擎模式，并且支持繁体分词和自定义词典。jieba 不属于 Python 标准库，使用前需要先安装，安装命令为：pip install jieba

这里主要介绍jieba包的分词功能

测试

代码

 
# -*- coding: utf-8 -*-
# @Time : 2022/5/1 15:52
# @Author : MinChess
# @File : test.py
# @Software: PyCharm
import jieba
 
# Sentence shared by the three jieba.cut demonstrations below.
sample = "我在东北师范大学测试结巴库"

# Full mode (cut_all=True).
print("/ ".join(jieba.cut(sample, cut_all=True)))

# Precise mode, requested explicitly (cut_all=False).
print("/ ".join(jieba.cut(sample, cut_all=False)))

# Precise mode again: it is the default when cut_all is omitted.
print("/ ".join(jieba.cut(sample)))

# Search-engine mode.
print("/ ".join(jieba.cut_for_search("我就读与东北师范大学，我的专业是信息资源管理")))

输出

实现去停用词

实现思路：读取自定义停用词表，组成集合；将分词后的每个词与停用词表比对，命中的词直接去除。原理和用正则表达式清洗文档一样，匹配到不需要的内容就去掉。

完整代码

:::tip

方法有很多，提供两个最常用的，其实已经内置了很多功能了，根据实际情况改改代码就可以实现更强大的功能

:::

清洗后分词并去除停用词

 
# -*- coding: utf-8 -*-
# @Time : 2022/5/1 15:52
# @Author : MinChess
# @File : stop.py
# @Software: PyCharm
 
import jieba
import re
 
# 利用jieba对文本进行分词，返回切词后的list
def seg_doc(str_doc):
    """Segment a document with jieba and return the kept words as one flat list.

    The text is split on newlines; each line is cleaned with ``textParse``,
    cut in precise mode (``cut_all=False``) and filtered with ``rm_tokens``
    against the set returned by ``get_stop_words``.

    Args:
        str_doc: The raw document text.

    Returns:
        list: The tokens kept from every line, in document order.
    """
    # Load the stop words once; every line is filtered against the same set.
    stwlist = get_stop_words()
    # Flatten while iterating. The previous sum(list_of_lists, []) copied the
    # accumulated list for every line, which is quadratic in document size.
    return [
        word
        for part in str_doc.split('\n')
        for word in rm_tokens(jieba.cut(textParse(part), cut_all=False), stwlist)
    ]
 
# 正则对字符串清洗
def textParse(str_doc):
    """Replace every run of ASCII letters, digits and listed punctuation with one space.

    Args:
        str_doc: The text to clean.

    Returns:
        str: ``str_doc`` with each matched run collapsed to a single space.
    """
    # Character class of everything to blank out: ASCII letters and digits
    # plus the ASCII and full-width punctuation marks listed here.
    noise_pattern = '[a-zA-Z0-9’!"#$%&\'()*+,-./:：;；|<=>?@，—。?★、…【】《》？“”‘’！[\\]^_`{|}~]+'
    return re.sub(noise_pattern, ' ', str_doc)
 
# 创建停用词列表
def get_stop_words(path=r'自定义词表.txt'):
    """Load the stop-word list from a UTF-8 text file, one entry per line.

    Args:
        path: Location of the stop-word file.

    Returns:
        set: The distinct lines of the file. A file that ends with a newline
        also contributes the empty string, as ``split`` on the newline
        character produces a trailing empty item.
    """
    # A context manager closes the handle even if reading fails; the
    # previous bare open(...).read() never closed it.
    with open(path, 'r', encoding='utf-8') as stop_file:
        stop_words = set(stop_file.read().split('\n'))
    # Echo the loaded set, as the function has always done.
    print(stop_words)
    return stop_words
 
# 去掉一些停用词和数字
def rm_tokens(words, stwlist):
    """Drop stop words, all-digit tokens and single-character tokens.

    Args:
        words: Iterable of token strings, for example the generator
            returned by ``jieba.cut``.
        stwlist: Container of stop words; membership is tested with ``in``.

    Returns:
        list: The remaining tokens in their original order.
    """
    # One filtering pass replaces the reverse index loop with pop(i), which
    # shifted the tail of the list on every removal.
    return [
        word
        for word in words
        if word not in stwlist       # stop word
        and not word.isdigit()       # number
        and len(word) != 1           # single character
    ]
 
 
# 读取文本信息
def readFile(path):
    """Return the entire content of the UTF-8 text file at ``path``.

    Args:
        path: Location of the file to read.

    Returns:
        str: The decoded file content.
    """
    with open(path, 'r', encoding='utf-8') as source:
        return source.read()
 
 
if __name__ == '__main__':
    path = r'待处理文件.txt'
    str_doc = readFile(path)

    # Segment the document and drop the stop words.
    word_list = seg_doc(str_doc)
    print(word_list)

    # Join the kept tokens with single spaces.
    wl_space_split = " ".join(word_list)
    print(wl_space_split)

    # Append the filtered result to a text file. The encoding is given
    # explicitly so the Chinese text is written as UTF-8 like the inputs,
    # instead of in whatever codec the platform locale selects.
    with open('lunwen2.txt', 'a', encoding='utf-8') as file_handle:
        file_handle.write(wl_space_split)
        file_handle.write('\n')

分词并统计词频

 
# -*- coding: utf-8 -*-
# @Time : 2022/5/1 15:23
# @Author : MinChess
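# @File : jieba.py  (NOTE: save the script under another name, e.g. word_freq.py; a file called jieba.py shadows the jieba package and breaks "import jieba")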
# @Software: PyCharm
import jieba
import re
import time
from collections import Counter
 
 
# Segment the input line by line: write the space-separated tokens of each
# line to the output file and collect every token for counting.
all_words = []
with open('待分词.txt', encoding='utf-8') as source, \
        open('已分词.txt', 'w', encoding='utf-8') as f:
    for line in source:
        # str.strip returns a new string, so the result must be kept; the
        # line break is written back explicitly below.
        line = line.strip('\n')
        seg_list = jieba.cut(line, cut_all=False)
        cut_words = " ".join(seg_list)
        f.write(cut_words + '\n')
        # Splitting per line keeps words of adjacent lines from merging.
        all_words.extend(cut_words.split())

# Show every token that was collected.
print(all_words)

# Count the tokens, ignoring single-character ones.
c = Counter(x for x in all_words if len(x) > 1)

# Print the 20 most frequent words.
print('\n词频统计结果：')
for (k, v) in c.most_common(20):
    print("%s:%d" % (k, v))

# Store the full ranking as rank,word,count rows.
name = "词频.csv"
with open(name, 'w', encoding='utf-8') as fw:
    for i, (k, v) in enumerate(c.most_common(), start=1):
        fw.write("%d,%s,%d\n" % (i, k, v))
print("完成写入!")

	# -- coding: utf-8 --
	# @Time : 2022/5/1 15:52
	# @Author : MinChess
	# @File : test.py
	# @Software: PyCharm
	import jieba

	# Sentence shared by the three jieba.cut demonstrations below.
	sample = "我在东北师范大学测试结巴库"

	# Full mode (cut_all=True).
	print("/ ".join(jieba.cut(sample, cut_all=True)))

	# Precise mode, requested explicitly (cut_all=False).
	print("/ ".join(jieba.cut(sample, cut_all=False)))

	# Precise mode again: it is the default when cut_all is omitted.
	print("/ ".join(jieba.cut(sample)))

	# Search-engine mode.
	print("/ ".join(jieba.cut_for_search("我就读与东北师范大学，我的专业是信息资源管理")))

	# -- coding: utf-8 --
	# @Time : 2022/5/1 15:23
	# @Author : MinChess
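	# @File : jieba.py  (NOTE: save the script under another name, e.g. word_freq.py; a file called jieba.py shadows the jieba package and breaks "import jieba")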
	# @Software: PyCharm
	import jieba
	import re
	import time
	from collections import Counter


	# Segment the input line by line: write the space-separated tokens of each
	# line to the output file and collect every token for counting.
	# (The nested indentation lost in this copy of the script is restored.)
	all_words = []
	with open('待分词.txt', encoding='utf-8') as source, \
	        open('已分词.txt', 'w', encoding='utf-8') as f:
	    for line in source:
	        # str.strip returns a new string, so the result must be kept; the
	        # line break is written back explicitly below.
	        line = line.strip('\n')
	        seg_list = jieba.cut(line, cut_all=False)
	        cut_words = " ".join(seg_list)
	        f.write(cut_words + '\n')
	        # Splitting per line keeps words of adjacent lines from merging.
	        all_words.extend(cut_words.split())

	# Show every token that was collected.
	print(all_words)

	# Count the tokens, ignoring single-character ones.
	c = Counter(x for x in all_words if len(x) > 1)

	# Print the 20 most frequent words.
	print('\n词频统计结果：')
	for (k, v) in c.most_common(20):
	    print("%s:%d" % (k, v))

	# Store the full ranking as rank,word,count rows.
	name = "词频.csv"
	with open(name, 'w', encoding='utf-8') as fw:
	    for i, (k, v) in enumerate(c.most_common(), start=1):
	        fw.write("%d,%s,%d\n" % (i, k, v))
	print("完成写入!")