任务一:Harry Potter词频统计及人名词云。验证性实验,可以不写实验报告册。
任务二:三国演义人名词云。
设计一个程序,读出“三国演义.txt”文件中的三国演义全文,将常见人名的不同称呼合并(去重)后统计词频,列出词频最高的10-20个词,并生成词云(可以使用不同的形状)。
例如‘玄德’,‘刘备’,‘刘皇叔’,‘皇叔’都是同一个人。
任务一
import string
import wordcloud
# Result containers filled further down: the top words, their counts, and
# the running word -> occurrence-count table.
name = []
times = []
counts = {}

# Read the whole book; the context manager closes the file handle as soon as
# the text is in memory. No encoding is passed, so the platform default is used.
with open('Harry Potter.txt', "r") as source_file:
    txt = source_file.read()

# Strip every ASCII punctuation character in a single pass
# (string.punctuation already contains the double quote).
txt = txt.translate(str.maketrans('', '', string.punctuation))
words = txt.split()

# Capitalised words that are frequent but are not character names.
excludes = {
    'The', 'You', 'Professor', 'But', 'Well', 'And', 'They', 'What', 'She',
    'There', "Yeah", "That", "Dark", "Then", "How",
    "Death", "Yes", "Mrs", "Ive", "Its", "Ill", "Uncle", "This",
}
# Every first name / surname that should be merged, mapped to the single
# label under which that character is counted.
_CANONICAL_NAMES = {
    "Harry": "Harry.Potter",
    "Potter": "Harry.Potter",
    "Ron": "Ron.Weasley",
    "Weasley": "Ron.Weasley",
    "Hermione": "Hermione.Granger",
    "Granger": "Hermione.Granger",
    "Albus": "Albus.Dumbledore",
    "Dumbledore": "Albus.Dumbledore",
    "Draco": "Draco.Malfoy",
    "Malfoy": "Draco.Malfoy",
    "Tom": "Voldemort",
    "Riddle": "Voldemort",
    "Voldemort": "Voldemort",
    "Fred": "Fred.Weasley",
    "Severus": "Severus.Snape",
    "Snape": "Severus.Snape",
    "Rubeus": "Rubeus.Hagrid",
    "Hagrid": "Rubeus.Hagrid",
    "Sirius": "Sirius.Black",
    "Black": "Sirius.Black",
}


def csv(storage):
    """Return the canonical character label for a word.

    Args:
        storage: The word to normalise (a single token from the text).

    Returns:
        The merged label (for example ``"Harry.Potter"`` for both
        ``"Harry"`` and ``"Potter"``), or ``storage`` unchanged when the
        word is not a known alias.
    """
    # Use the argument itself rather than a global loop variable, so the
    # function works no matter what the caller's variable is called.
    return _CANONICAL_NAMES.get(storage, storage)
# Count every word of at least three characters. Title-cased words are first
# mapped to a canonical character label so that aliases share one counter.
for word in words:
    if len(word) < 3:
        continue
    if word.istitle():
        word = csv(word)
    counts[word] = counts.get(word, 0) + 1

# Drop the stop words; pop() with a default tolerates words that never
# occurred in the text, where `del` would raise KeyError.
for wordes in excludes:
    counts.pop(wordes, None)

# Rank by frequency, highest first.
items = sorted(counts.items(), key=lambda x: x[1], reverse=True)

# Report the ten most frequent words; slicing copes with texts that contain
# fewer than ten distinct words.
for word, count in items[:10]:
    print(f"{word:<18}{count:>8}")
    name.append(word)
    times.append(count)

# Configure and draw the word cloud. The counts are passed explicitly so the
# word size reflects frequency; in a space-joined string every name would
# appear exactly once.
c = wordcloud.WordCloud(background_color="white")
c.generate_from_frequencies(dict(zip(name, times)))
c.to_file("e.jpg")
任务二
import jieba
import wordcloud
import matplotlib.pyplot as plt
# Open the text file and read the whole novel; the context manager closes
# the file handle once the text is in memory.
with open('Romance of the Three Kingdoms.txt', "r", encoding="utf-8") as novel_file:
    txt = novel_file.read()

# Frequent words that are not personal names and must not appear in the result.
excludes = {
    "将军", "却说", "荆州", "二人", "不可", "不能", "如此",
    "商议", "如何", "主公", "军士", "左右", "军马", "引兵", "次日",
    "大喜", "天下", "东吴", "于是", "今日", "不敢", "陛下", "一人",
    "都督", "人马", "不知", "汉中", "只见", "众将", "上马", "大叫",
    "此人", "蜀兵", "太守", "夫人", "先主", "后人", "背后", "城中",
    "后主", "天子", "一面", "何不", "大军",
}

# Segment the text with jieba; lcut returns the tokens as a list.
words = jieba.lcut(txt)

# Word-frequency table: token -> number of occurrences.
counts = {}
# Merge the different names of one person into a single counting key.
# NOTE(review): "丞相" is a title, so counting it as 曹操 presumably also
# captures other holders of that office — confirm this is intended.
aliases = {
    "诸葛亮": "孔明",
    "孔明曰": "孔明",
    "关公": "关羽",
    "云长": "关羽",
    "玄德": "刘备",
    "玄德曰": "刘备",
    # The task statement lists these two as further names of the same person.
    "刘皇叔": "刘备",
    "皇叔": "刘备",
    "孟德": "曹操",
    "丞相": "曹操",
}

for word in words:
    # Single-character tokens are skipped entirely.
    if len(word) == 1:
        continue
    rword = aliases.get(word, word)
    counts[rword] = counts.get(rword, 0) + 1

# Remove the stop words; pop() with a default tolerates words that never
# occurred in the text, where `del` would raise KeyError.
for word in excludes:
    counts.pop(word, None)
# (word, count) pairs ordered by count, most frequent first.
items = sorted(counts.items(), key=lambda x: x[1], reverse=True)

name = []
times = []
# Take the 100 most frequent words; slicing copes with inputs that contain
# fewer than 100 distinct words.
for word, count in items[:100]:
    print(f"{word:<10}{count:>5}")
    name.append(word)
    times.append(count)

# Word-cloud settings: canvas width and height, the font file used to draw
# the Chinese glyphs, and the background colour.
w = wordcloud.WordCloud(
    width=400,
    font_path="msyh.ttc",
    height=300,
    background_color="Azure",
)
# Pass the counts explicitly so the word size reflects frequency; in a
# space-joined string every word would appear exactly once.
w.generate_from_frequencies(dict(zip(name, times)))

plt.imshow(w)
plt.axis('off')  # hide the axes
plt.show()  # display the generated word cloud
w.to_file("threeguo.png")  # save the image to disk