Word2Vecによる楽劇「ニュルンベルクのマイスタージンガー」(リヒャルト・ワーグナー作曲のオペラ)のドイツ語歌詞の解析と関係性可視化の試み
NetworkXとBokehを用いて、インタラクティヴなネットワーク図を作成した。
https://github.com/ytknzw/wagner/blob/master/Wagner2Vek_Meistersinger.ipynb
%matplotlib inline
# スクレイピング・データ整形
import requests
from bs4 import BeautifulSoup
import re
import pandas as pd
# NLP
import spacy
import hashlib
import gensim
import codecs
from gensim.models import word2vec
from gensim.models.phrases import Phrases, Phraser
# ネットワーク図作成
import networkx as nx
import matplotlib.pyplot as plt
from bokeh.io import show, output_notebook
from bokeh.plotting import figure
from bokeh.models import Plot, Range1d, MultiLine, Circle,\
HoverTool, BoxZoomTool, ResetTool, TapTool, BoxSelectTool, WheelZoomTool, PanTool, SaveTool,\
ColumnDataSource, LabelSet
from bokeh.models.graphs import from_networkx, NodesAndLinkedEdges, EdgesAndLinkedNodes
from bokeh.palettes import Spectral4
def remove_headfoot(responce):
    """
    Strip the page header/footer from one act's libretto page.

    Arguments
        responce: response object returned by requests.get(); its ``text``
            attribute holds the raw HTML of the page.
    Return values
        text: the libretto HTML fragment, starting at the act heading
            (``<span class="act">``), with the trailing credits removed and
            a blank line (``<br /><br />``) inserted before all-caps runs.
    """
    # Decode with the encoding requests sniffed from the body rather than
    # the (often wrong) header-declared one.
    responce.encoding = responce.apparent_encoding
    # Flatten to a single line, then keep everything from the first act
    # heading onward.
    flat = re.sub(r'[\n\r\t]', '', responce.text)
    body = re.findall(r'<span class="act".+', flat)[0]
    # Drop the trailing "libretto by ..." credits table.
    body = re.sub(r'<tr valign="top"><td>libretto by.+', '', body)
    # Insert a paragraph break before runs of capitals/digits (speaker names
    # and stage headings) so text2df() can split on '<br /><br />' later.
    text = re.sub(r'(<br />|\s{2,})(?=[0-9\u00c4\u00d6\u00dc\u00dfA-Z][\s.0-9\u00c4\u00d6\u00dc\u00dfA-Z]{2,}<br />)',
                  '<br /><br />', body)
    return text
def text2df(text):
    """
    Shape and cleanse one act's libretto HTML into a dataframe.

    Arguments
        text: output of remove_headfoot() -- HTML starting at the act heading
    Return Values
        df: dataframe with one row per utterance and the columns
            'aufzug' (act), 'szene' (scene), 'name' (speaker), 'text' (line)
    """
    rows = []
    texts = text.split('<b>')
    # The chunk before the first <b> holds the act heading.
    aufzug = BeautifulSoup(texts.pop(0)).text
    print(aufzug)
    for t in texts:
        # Each <b>...</b> pair marks a scene heading.
        szene = t.split('</b>')[0]
        print(szene)
        # Strip italicised stage directions, then split into utterances at
        # the blank lines inserted by remove_headfoot().
        script = re.sub(r'(<br />)?<i>([\w\s\(\)\[\];,:.-]|<br />|</?t(d|r)[\w\s"=/]*>)+</i>', '',
                        t.split('</b>')[1]).split('<br /><br />')
        for s in script:
            if s not in (None, ''):
                s_list = re.sub(r'^<br />', '', s).split('<br />')
                if len(s_list) > 1:
                    # First line is the speaker, the rest is the sung text;
                    # drop leftover tags and runs of whitespace.
                    name = re.sub(r'<[\w\s"=/]+>|\s{2,}', '', s_list.pop(0))
                    s_text = re.sub(r'<[\w\s"=/]+>|\s{2,}', '', ' '.join(s_list))
                    rows.append({'aufzug': aufzug, 'szene': szene, 'name': name, 'text': s_text})
    # Build the frame once at the end instead of pd.concat inside the loop
    # (O(n) instead of O(n^2) copying).
    return pd.DataFrame(rows).reset_index(drop=True)
# Scrape the libretto pages for all three acts and stack them into one frame.
frames = []
for akt in range(1, 4):
    res = requests.get(f'http://www.murashev.com/opera/Die_Meistersinger_von_N%C3%BCrnberg_libretto_German_Act_{akt}')
    text = remove_headfoot(res)
    frames.append(text2df(text))
dat = pd.concat(frames)
内容を確認。
# Display the assembled dataframe for inspection.
dat
# Sanity check: look for leftover all-caps runs (stage cues / speaker names)
# that survived the cleaning above, and print the offending row indices.
for i, t in enumerate(dat.text):
    res = re.findall('[\s.0-9\u00c4\u00d6\u00dc\u00dfA-Z]{3,}', t)
    if len(res) > 0:
        print(i)
        print(res)
単語数
len(' '.join(dat.text).split())
# Load the German spaCy pipeline and analyse the whole libretto at once.
de = spacy.load('de')
dok = de(' '.join(dat.text))
#dok
# Inspect the analysis of the first 20 tokens.
for t in dok[0:20]:
    print(t.text, t.norm_, t.lemma_, t.pos_, t.is_stop, t.is_alpha, t.is_punct)
# Collect lemmas of alphabetic, non-stop-word tokens whose part of speech is
# noun, verb, adjective, proper noun or adverb.
# BUG FIX: the original tested `~t.is_stop` -- bitwise NOT of a bool yields
# -2/-1, which is always truthy, so stop words were never filtered out.
lemmata = [t.lemma_ for t in dok
           if not t.is_stop and t.is_alpha
           and t.pos_ in ('NOUN', 'VERB', 'ADJ', 'PROPN', 'ADV')]
# Same layout as the original newline-joined string (leading '\n' per lemma).
t_lemma = ''.join('\n' + lemma for lemma in lemmata)
# One "sentence" per line for gensim; the leading newline yields an empty
# first entry, as before.
corpus = [zeile.split() for zeile in t_lemma.splitlines()]
# https://stackoverflow.com/questions/34831551/ensure-the-gensim-generate-the-same-word2vec-model-for-different-runs-on-the-sam/34849797
# Deterministic hash function so gensim produces the same model on every run
# (Python's builtin hash() is randomly salted per process; SHA-256 is not).
def dummyhash(string):
    """Return a deterministic integer hash of *string* for gensim's hashfxn."""
    digest = hashlib.sha256(string.encode()).hexdigest()
    return int(digest, 16)
# Build the Word2Vec model: 100-dim vectors, ignore words seen fewer than
# 3 times, context window of 5, hierarchical softmax (no negative sampling).
# NOTE(review): `size=` is the gensim 3.x keyword; gensim >= 4 renamed it to
# `vector_size=` -- confirm the pinned gensim version.
model = word2vec.Word2Vec(corpus, size=100, min_count=3, window=5, hs=1, negative=0,
# single worker + deterministic hash function for reproducible results
workers=1, hashfxn=dummyhash)
model.save('Meistersinger.model')
# Reload later with: model = Word2Vec.load('Meistersinger.model')
モデルの内容を確認する。
# Vocabulary retained by the model (gensim 3.x `wv.vocab` API).
model.wv.vocab.keys()
# Vocabulary size.
len(model.wv.vocab.keys())
# Raw embedding vector of a single word.
model.wv['Walther']
単語間のsimilarity
# Pairwise cosine similarities between selected characters and concepts.
model.wv.similarity('Walther', 'Beckmesser')
model.wv.similarity('Walther', 'Sachs')
model.wv.similarity('Walther', 'Meistersinger')
model.wv.similarity('David', 'Meistersinger')
model.wv.similarity('Gott', 'Lied')
ある単語に近しい単語
# For each word of interest, print its ten nearest neighbours in embedding
# space together with the similarity score ("word 0.87" per line).
# (De-duplicated: the original repeated this cell nine times verbatim.)
for wort in ('Walther', 'Stolzing', 'Junker', 'Eva', 'Beckmesser',
             'Sachs', 'David', 'Lieb', 'Lied'):
    similar_words = model.wv.most_similar(positive=[wort], topn=10)
    print(*[" ".join([v, str("{:.2f}".format(s))]) for v, s in similar_words], sep="\n")
2単語間の差分
# Word arithmetic: words close to `positiv` but far from `negativ`.
# (De-duplicated: the original repeated this cell three times verbatim.)
for positiv, negativ in (('Walther', 'Beckmesser'),
                         ('Sachs', 'Beckmesser'),
                         ('Sachs', 'Pogner')):
    results = model.wv.most_similar(positive=[positiv], negative=[negativ], topn=10)
    for result in results:
        print(result)
見やすいネットワーク図になるよう、出現頻度が20以上のノード、similarityが0.15超のedgeに限定した。
# Keep words appearing at least 20 times -- plus the main characters, whatever
# their frequency -- so the network diagram stays readable.
vokab = []
freq = []
hauptfiguren = {'Walther', 'Eva', 'Beckmesser', 'Sachs', 'Pogner', 'Meistersinger', 'David'}
for wort, eintrag in model.wv.vocab.items():
    if eintrag.count >= 20 or wort in hauptfiguren:
        vokab.append(wort)
        freq.append(eintrag.count)
出現頻度の分布確認
# Frequency distribution of the kept words.
pd.Series(freq).value_counts().sort_index()
# Peek at the first 20 kept words.
vokab[0:20]
# Number of nodes that will go into the graph.
len(vokab)
ネットワークグラフの作成
# Build a directed graph: one node per kept word (carrying its frequency as
# a node attribute), one edge per ordered word pair whose cosine similarity
# exceeds 0.15. Note x == y pairs also qualify, producing self-loops.
graph = nx.DiGraph()
for wort, anzahl in zip(vokab, freq):
    graph.add_node(wort, freq=anzahl)
for quelle in vokab:
    for ziel in vokab:
        aehnlichkeit = model.wv.similarity(quelle, ziel)
        if aehnlichkeit > 0.15:
            graph.add_edge(quelle, ziel, weight=aehnlichkeit)
# Inspect node attributes and basic graph statistics.
print(dict(graph.nodes))
graph.number_of_nodes()
graph.number_of_edges()
# NOTE(review): Graph.number_of_selfloops() was removed in networkx 2.4;
# newer versions require nx.number_of_selfloops(graph) -- confirm pinned version.
graph.number_of_selfloops()
# Shortest paths between main characters as a quick sanity check.
print(list(nx.shortest_path(graph, source='Walther', target='Meistersinger')))
print(list(nx.shortest_path(graph, source='David', target='Meistersinger')))
print(list(nx.shortest_path(graph, source='Walther', target='Eva')))
print(list(nx.shortest_path(graph, source='Beckmesser', target='Eva')))
print(list(nx.shortest_path(graph, source='Walther', target='Beckmesser')))
print(list(nx.shortest_path(graph, source='Walther', target='Sachs')))
#print(list(nx.all_simple_paths(graph, source='Walther', target='Meistersinger')))
# Quick static matplotlib rendering before the interactive Bokeh plot.
fig, ax = plt.subplots(figsize=(16, 16), dpi=300)
nx.draw_networkx(graph, with_labels=True, font_weight='bold')
# Render the interactive version inside the notebook.
output_notebook()
plot = Plot(plot_width=800, plot_height=800,
            x_range=Range1d(-2, 2), y_range=Range1d(-2, 2))
# BUG FIX: the opera's title is "Die Meistersinger von Nürnberg" (plural
# article, cf. the URL scraped above), not "Der".
plot.title.text = "Wagner2Vek: Ähnlichkeitgraph von \"Die Meistersinger von Nürnberg\" bei Wort2Vek"
# Hovering a node shows the word's frequency in the libretto.
node_hover_tool = HoverTool(tooltips=[("freq", "@freq")])
plot.add_tools(node_hover_tool, TapTool(), BoxSelectTool(), BoxZoomTool(), ResetTool(), WheelZoomTool(), PanTool(), SaveTool())
# Make the mouse wheel zoom by default.
plot.toolbar.active_scroll = plot.select_one(WheelZoomTool)
# Lay the graph out with a spring (force-directed) layout.
graph_renderer = from_networkx(graph, nx.spring_layout, scale=1.5, center=(0,0))
# Node styling: purple circles; palette green when selected, red on hover.
graph_renderer.node_renderer.glyph = Circle(size=15, fill_color='#773280', fill_alpha=0.6, line_color='#773280')
graph_renderer.node_renderer.selection_glyph = Circle(size=15, fill_color=Spectral4[2])
graph_renderer.node_renderer.hover_glyph = Circle(size=15, fill_color='firebrick', fill_alpha=0.6, line_color='firebrick')
# Inspect the glyph's effective property values.
graph_renderer.node_renderer.glyph.properties_with_values()
# Edge styling mirrors the node states: grey normally, green/red otherwise.
graph_renderer.edge_renderer.glyph = MultiLine(line_color="#CCCCCC", line_alpha=0.8, line_width=5)
graph_renderer.edge_renderer.selection_glyph = MultiLine(line_color=Spectral4[2], line_width=5)
graph_renderer.edge_renderer.hover_glyph = MultiLine(line_color='firebrick', line_alpha=0.6, line_width=5)
# Selecting or hovering a node also highlights its incident edges.
graph_renderer.selection_policy = NodesAndLinkedEdges()
graph_renderer.inspection_policy = NodesAndLinkedEdges()
plot.renderers.append(graph_renderer)
# Label each node with its word at the laid-out coordinates.
x, y = zip(*graph_renderer.layout_provider.graph_layout.values())
source = ColumnDataSource({'x': x, 'y': y,
'wort': graph_renderer.node_renderer.data_source.data['index']})
labels = LabelSet(x='x', y='y', text='wort', source=source, text_alpha=0.7)
plot.renderers.append(labels)
show(plot)
基本的に会話文であり話者間の方向性があるオペラの歌詞を全て結合して通常のテキストと同様に解析したが、次は話者と相手を考慮して分析・可視化したい。