Python文件操作

发布于:2022-12-10 ⋅ 阅读:(527) ⋅ 点赞:(0)

任务一:Harry Potter词频统计及人名词云。验证性实验,可以不写实验报告册。

任务二:三国演义人名词云。

设计一个程序,读出“三国演义.txt”文件中的三国演义全文,将常见人名的不同称呼合并(去重)后统计词频,列出词频最高的10-20个词,并生成词云(可以使用不同的形状)。

例如‘玄德’,‘刘备’,‘刘皇叔’,‘皇叔’都是同一个人。

任务拓展:利用列表或字典来存储同一个人的不同称呼。学会修改
wordcloud.WordCloud() 的参数,改变所生成词云的字体和形状等。

 任务一

import string
import wordcloud

# Accumulators filled in by the rest of the script: the top-ranked words,
# their counts, and the word -> occurrence-count table.
name = []
times = []
counts = {}

# Read the whole text; the context manager closes the file handle right away
# instead of leaving it open for the lifetime of the script.
with open('Harry Potter.txt', "r") as book:
    txt = book.read()

# Delete every ASCII punctuation character in a single pass.
# string.punctuation already contains the double quote, so nothing extra
# needs to be appended to it.
txt = txt.translate(str.maketrans('', '', string.punctuation))
words = txt.split()

# Frequent capitalised words that are not character names; they are removed
# from the counts later on.
excludes = {'The', 'You', 'Professor', 'But', 'Well', 'And', 'They', 'What', 'She',
            'There', "Yeah", "That",  "Dark", "Then", "How",
            "Death", "Yes", "Mrs", "Ive", "Its", "Ill", "Uncle", "This"}
# Maps a single name token (first name or surname) to the label used for that
# character in the frequency table. Note that "Weasley" is attributed to Ron,
# while "Fred" has its own label.
_CHARACTER_ALIASES = {
    "Harry": "Harry.Potter",
    "Potter": "Harry.Potter",
    "Ron": "Ron.Weasley",
    "Weasley": "Ron.Weasley",
    "Hermione": "Hermione.Granger",
    "Granger": "Hermione.Granger",
    "Albus": "Albus.Dumbledore",
    "Dumbledore": "Albus.Dumbledore",
    "Draco": "Draco.Malfoy",
    "Malfoy": "Draco.Malfoy",
    "Tom": "Voldemort",
    "Riddle": "Voldemort",
    "Voldemort": "Voldemort",
    "Fred": "Fred.Weasley",
    "Severus": "Severus.Snape",
    "Snape": "Severus.Snape",
    "Rubeus": "Rubeus.Hagrid",
    "Hagrid": "Rubeus.Hagrid",
    "Sirius": "Sirius.Black",
    "Black": "Sirius.Black",
}


def csv(storage):
    """Return the canonical character label for a name token.

    The lookup is done on the ``storage`` argument itself, so the function
    no longer depends on a global variable named ``word`` happening to hold
    the same value as the argument.

    Args:
        storage: A single word taken from the text.

    Returns:
        The merged label (for example ``"Harry.Potter"`` for both ``"Harry"``
        and ``"Potter"``) when the word is a known alias, otherwise the word
        unchanged.
    """
    return _CHARACTER_ALIASES.get(storage, storage)
# Count title-cased words, merging the different names of one character
# into a single label via csv().
for word in words:
    if len(word) < 3:  # ignore words shorter than three characters
        continue
    if word.istitle():
        word = csv(word)
        counts[word] = counts.get(word, 0) + 1

# Remove the non-name words. pop() with a default is used so that an excluded
# word which never occurs in the text does not raise KeyError.
for wordes in excludes:
    counts.pop(wordes, None)

# Rank by occurrence count, most frequent first.
items = sorted(counts.items(), key=lambda x: x[1], reverse=True)

# Keep the ten most frequent entries; slicing (rather than indexing 0..9)
# also works when fewer than ten distinct words were counted.
for word, count in items[:10]:
    print(f"{word:<18}{count:>8}")
    name.append(word)
    times.append(count)

# Build the word cloud from the (label, count) pairs so that the size of each
# label reflects its frequency. Passing the labels as a space-joined text to
# generate() would drop the counts and let WordCloud re-tokenise the text,
# which breaks labels such as "Harry.Potter" apart at the dot.
c = wordcloud.WordCloud(background_color="white")
c.generate_from_frequencies(dict(zip(name, times)))
c.to_file("e.jpg")
 

任务二

import jieba
import wordcloud
import matplotlib.pyplot as plt
# Read the full text of the novel; the context manager closes the file handle
# as soon as the content has been read.
with open('Romance of the Three Kingdoms.txt', "r", encoding="utf-8") as novel:
    txt = novel.read()
# Frequent words that are not personal names; they are removed from the
# counts later on.
excludes = {"将军", "却说", "荆州", "二人", "不可", "不能", "如此",
            "商议","如何","主公","军士","左右","军马","引兵","次日",
            "大喜","天下","东吴","于是","今日","不敢","陛下","一人",
            "都督","人马","不知","汉中","只见","众将","上马","大叫",
            "此人","蜀兵","太守","夫人","先主","后人","背后","城中",
            "后主","天子","一面","何不","大军"}
words = jieba.lcut(txt)  # segment the text with jieba; the result is a list of tokens
counts = {}  # word -> occurrence count
# Several tokens refer to the same person; map each of them to one key so
# that their occurrences are counted together.
aliases = {
    "诸葛亮": "孔明",
    "孔明曰": "孔明",
    "关公": "关羽",
    "云长": "关羽",
    "玄德": "刘备",
    "玄德曰": "刘备",
    "孟德": "曹操",
    "丞相": "曹操",
}
for word in words:
    if len(word) == 1:  # skip single-character tokens
        continue
    rword = aliases.get(word, word)
    counts[rword] = counts.get(rword, 0) + 1

# Remove the non-name words. pop() with a default is used so that an excluded
# word which never occurs in the text does not raise KeyError.
for word in excludes:
    counts.pop(word, None)

# List of (word, count) pairs, most frequent first.
items = sorted(counts.items(), key=lambda x: x[1], reverse=True)
name = []
times = []
# Keep the 100 most frequent words; slicing also works when fewer than
# 100 distinct words were counted.
for word, count in items[:100]:
    print(f"{word:<10}{count:>5}")
    name.append(word)
    times.append(count)

# Configure the word cloud: canvas size, font file and background colour.
# NOTE(review): msyh.ttc is presumably a font with Chinese glyphs (needed to
# render the names) - confirm it is available on the target machine.
w = wordcloud.WordCloud(
    width=400,
    height=300,
    font_path="msyh.ttc",
    background_color="Azure",
)

# Build the cloud from the (word, count) pairs so that the size of each word
# reflects its frequency; a space-joined text of the words alone would give
# every word the same weight.
w.generate_from_frequencies(dict(zip(name, times)))

# Save the image before opening the preview window, so the file is written
# even if the interactive display is interrupted.
w.to_file("threeguo.png")

plt.imshow(w)
plt.axis('off')  # hide the axes
plt.show()  # display the generated word cloud

 

 

本文含有隐藏内容,请 开通VIP 后查看