言語処理100本ノック2020（python）備忘録40-49

まずはcabochaをインストール
$cabocha -f1 neko.txt>neko.txt.cabocha
を実行。-f1をオプションでつけた。

* 0 -1D 0/0 0.000000
一	名詞,数,*,*,*,*,一,イチ,イチ
EOS
EOS
* 0 2D 0/0 -0.764522
　	記号,空白,*,*,*,*,　,　,　
* 1 2D 0/1 -0.764522
吾輩	名詞,代名詞,一般,*,*,*,吾輩,ワガハイ,ワガハイ
は	助詞,係助詞,*,*,*,*,は,ハ,ワ
* 2 -1D 0/2 0.000000
猫	名詞,一般,*,*,*,*,猫,ネコ,ネコ
で	助動詞,*,*,*,特殊・ダ,連用形,だ,デ,デ
ある	助動詞,*,*,*,五段・ラ行アル,基本形,ある,アル,アル
。	記号,句点,*,*,*,*,。,。,。

のようなファイルが生成される。

40

class Morph:
    """Plain record for one morpheme of the CaboCha/MeCab output.

    Attributes:
        surface: Surface form of the morpheme.
        base: Base form of the morpheme.
        pos: Part of speech.
        pos1: Part-of-speech subcategory.
    """

    def __init__(self, surface, base, pos, pos1):
        self.surface, self.base = surface, base
        self.pos, self.pos1 = pos, pos1

# Read the whole CaboCha output and cut it into per-sentence pieces.
PATH = 'neko.txt.cabocha'
with open(PATH) as r:
    data = r.read().split('EOS')

cabocha_sentence_list = []
for sentence in data:
    if sentence in ('', '\n'):
        # Pieces produced by consecutive EOS markers carry no tokens.
        continue
    cabocha_word_list = []
    for word in sentence.split('\n'):
        columns = word.split('\t')
        if len(columns) == 1:
            # Only tab-separated lines are tokens; headers and blanks are skipped.
            continue
        features = columns[1].split(',')
        cabocha_word_list.append(
            Morph(columns[0], features[6], features[0], features[1]))
    cabocha_sentence_list.append(cabocha_word_list)

# Show the surface forms of the third collected sentence.
morpheme_array = [morph.surface for morph in cabocha_sentence_list[2]]
print(morpheme_array)

出力は

['名前', 'は', 'まだ', '無い', '。']

単にメンバを持つだけのクラスを作る。
morph = Morph(surface, base, pos, pos1)
でそのクラスの要素を作り、cabocha_word_listに追加する。
3文目の形態素列のみを出力

41

問題文の意味があまりわからなかったので、
言語処理100本ノック 2020「41. 係り受け解析結果の読み込み（文節・係り受け）」 - u++の備忘録
を参考にした。

class Morph(object):
    """A single morpheme read from one token line of CaboCha output.

    Attributes:
        surface: Surface form (the text before the tab).
        base: Base form (7th comma-separated feature).
        pos: Part of speech (1st feature).
        pos1: Part-of-speech subcategory (2nd feature).
    """

    def __init__(self, surface, base, pos, pos1):
        self.surface = surface
        self.base = base
        self.pos = pos
        self.pos1 = pos1


class Chunk(object):
    """A chunk (bunsetsu) with its morphemes and dependency links.

    Attributes:
        morphs: Morph objects that make up the chunk.
        dst: Index of the chunk this one depends on; -1 means no head.
        srcs: Indices of the chunks that depend on this one.
    """

    def __init__(self, morphs, dst):
        self.morphs = morphs
        self.dst = dst
        self.srcs = []


def parseCabocha(sentence):
    """Parse one sentence of ``cabocha -f1`` output into Chunk objects.

    Args:
        sentence: Text of a single sentence (one piece of the file split
            on ``EOS``).  Lines are either ``* idx dstD ...`` chunk headers
            or ``surface<TAB>feature,feature,...`` tokens.

    Returns:
        The sentence's chunks in order, each with ``dst`` taken from its
        header and ``srcs`` listing the chunks that point at it.
    """
    chunks = []
    morphs = []
    dst = -1  # tokens that precede any header form a head-less chunk

    for line in sentence.split('\n'):
        # A header has no tab; a token whose surface is '*' does.
        is_header = line.startswith('*') and '\t' not in line
        if line == '' or is_header:
            # Both a blank line and a new header close the open chunk.
            if morphs:
                chunks.append(Chunk(morphs, dst))
                morphs = []
            if is_header:
                dst = int(line.split(' ')[2].rstrip('D'))
            continue
        surface, feature = line.split('\t')[:2]
        attrs = feature.split(',')
        morphs.append(Morph(surface, attrs[6], attrs[0], attrs[1]))

    if morphs:
        # Input that does not end with a newline still yields its last chunk.
        chunks.append(Chunk(morphs, dst))

    for i, chunk in enumerate(chunks):
        # dst == -1 means "no head"; using it as an index would wrongly
        # register this chunk as a source of the last chunk.
        if chunk.dst != -1:
            chunks[chunk.dst].srcs.append(i)
    return chunks


# Parse every EOS-delimited piece of the CaboCha output into chunks.
PATH = 'neko.txt.cabocha'
with open(PATH) as r:
    data = r.read().split('EOS')
output = [parseCabocha(sentence) for sentence in data]

# Print index, text and head index for each chunk of output[9].
sentence_8th = output[9]
for i, chunk in enumerate(sentence_8th):
    surfaces = [morph.surface for morph in chunk.morphs]
    print(i, ''.join(surfaces), chunk.dst)

出力は

0 この 1
1 書生というのは 7
2 時々 4
3 我々を 4
4 捕えて 5
5 煮て 6
6 食うという 7
7 話である。 -1

dst が -1 の文節とは、最後の文節を意味しているのではなく、係り先がないことを意味していそう。

42

# Preparation:
# $cabocha -f1 neko.txt>neko.txt.cabocha
# Do not forget the -f1 option.

# Reference: https://upura.hatenablog.com/entry/2020/04/25/121525
class Morph(object):
    """A single morpheme read from one token line of CaboCha output.

    Attributes:
        surface: Surface form ('' for symbols, see parseCabocha).
        base: Base form (7th comma-separated feature).
        pos: Part of speech (1st feature).
        pos1: Part-of-speech subcategory (2nd feature).
    """

    def __init__(self, surface, base, pos, pos1):
        self.surface = surface
        self.base = base
        self.pos = pos
        self.pos1 = pos1


class Chunk(object):
    """A chunk (bunsetsu) with its morphemes and dependency links.

    Attributes:
        morphs: Morph objects that make up the chunk.
        dst: Index of the head chunk, kept as a string ('-1' = no head).
        srcs: Integer indices of the chunks that depend on this one.
    """

    def __init__(self, morphs, dst):
        self.morphs = morphs
        self.dst = dst
        self.srcs = []


def parseCabocha(sentence):
    """Parse one sentence of ``cabocha -f1`` output into Chunk objects.

    Symbols (pos == '記号') are kept as morphemes but their surface is
    blanked, so joining the surfaces of a chunk omits punctuation.

    Args:
        sentence: Text of a single sentence (one piece of the file split
            on ``EOS``).  Lines are either ``* idx dstD ...`` chunk headers
            or ``surface<TAB>feature,feature,...`` tokens.

    Returns:
        The sentence's chunks in order; ``dst`` is the head index as a
        string and ``srcs`` lists the chunks that point at each chunk.
    """
    chunks = []
    morphs = []
    dst = '-1'  # tokens that precede any header form a head-less chunk

    for line in sentence.split('\n'):
        # A header has no tab; a token whose surface is '*' does.
        is_header = line.startswith('*') and '\t' not in line
        if line == '' or is_header:
            # Both a blank line and a new header close the open chunk.
            if morphs:
                chunks.append(Chunk(morphs, dst))
                morphs = []
            if is_header:
                dst = line.split(' ')[2].rstrip('D')
            continue
        surface, feature = line.split('\t')[:2]
        attrs = feature.split(',')
        if attrs[0] == '記号':
            surface = ''
        morphs.append(Morph(surface, attrs[6], attrs[0], attrs[1]))

    if morphs:
        # Input that does not end with a newline still yields its last chunk.
        chunks.append(Chunk(morphs, dst))

    for i, chunk in enumerate(chunks):
        head = int(chunk.dst)
        # -1 means "no head"; using it as an index would wrongly register
        # this chunk as a source of the last chunk.
        if head != -1:
            chunks[head].srcs.append(i)
    return chunks


# Parse every EOS-delimited piece of the CaboCha output into chunks.
PATH = 'neko.txt.cabocha'
with open(PATH) as r:
    data = r.read().split('EOS')
output = [parseCabocha(sentence) for sentence in data]

# For output[9], print each chunk together with the chunk it depends on.
sentence_8th = output[9]
for i, chunk in enumerate(sentence_8th):
    dst_num = int(chunk.dst)
    if dst_num == -1:
        # Chunks without a head are not printed.
        continue
    chunk_out = ''.join(morph.surface for morph in chunk.morphs)
    chunk_dst_r = ''.join(morph.surface for morph in sentence_8th[dst_num].morphs)
    print(i, chunk_out, dst_num, chunk_dst_r, list(chunk.srcs))

出力はあまりにも長いので、output[9]として8文目のみを出力した。

0 この 1 書生というのは []
1 書生というのは 7 話である [0]
2 時々 4 捕えて []
3 我々を 4 捕えて []
4 捕えて 5 煮て [2, 3]
5 煮て 6 食うという [4]
6 食うという 7 話である [5]

chunkを作っている間に、

if pos == '記号':
    surface = ''

とすることで、
記号をチャンクから除外している。
表示時に、

chunk_out = ''.join([morph.surface for morph in chunk.morphs if morph.pos != '記号'])
chunk_dst_r = ''.join([morph.surface for morph in sentence_8th[dst_num].morphs if morph.pos != '記号'])

と消してもいい。

if dst_num != -1:
として除外しているが、
dst_num == -1:
においてもちゃんとsrcは格納されている。

43

class Morph(object):
    """A single morpheme read from one token line of CaboCha output.

    Attributes:
        surface: Surface form (the text before the tab).
        base: Base form (7th comma-separated feature).
        pos: Part of speech (1st feature).
        pos1: Part-of-speech subcategory (2nd feature).
    """

    def __init__(self, surface, base, pos, pos1):
        self.surface = surface
        self.base = base
        self.pos = pos
        self.pos1 = pos1


class Chunk(object):
    """A chunk (bunsetsu) with its morphemes and dependency links.

    Attributes:
        morphs: Morph objects that make up the chunk.
        dst: Index of the chunk this one depends on; -1 means no head.
        srcs: Indices of the chunks that depend on this one.
    """

    def __init__(self, morphs, dst):
        self.morphs = morphs
        self.dst = dst
        self.srcs = []


def parseCabocha(sentence):
    """Parse one sentence of ``cabocha -f1`` output into Chunk objects.

    Args:
        sentence: Text of a single sentence (one piece of the file split
            on ``EOS``).  Lines are either ``* idx dstD ...`` chunk headers
            or ``surface<TAB>feature,feature,...`` tokens.

    Returns:
        The sentence's chunks in order, each with ``dst`` taken from its
        header and ``srcs`` listing the chunks that point at it.
    """
    chunks = []
    morphs = []
    dst = -1  # tokens that precede any header form a head-less chunk

    for line in sentence.split('\n'):
        # A header has no tab; a token whose surface is '*' does.
        is_header = line.startswith('*') and '\t' not in line
        if line == '' or is_header:
            # Both a blank line and a new header close the open chunk.
            if morphs:
                chunks.append(Chunk(morphs, dst))
                morphs = []
            if is_header:
                dst = int(line.split(' ')[2].rstrip('D'))
            continue
        surface, feature = line.split('\t')[:2]
        attrs = feature.split(',')
        morphs.append(Morph(surface, attrs[6], attrs[0], attrs[1]))

    if morphs:
        # Input that does not end with a newline still yields its last chunk.
        chunks.append(Chunk(morphs, dst))

    for i, chunk in enumerate(chunks):
        # dst == -1 means "no head"; using it as an index would wrongly
        # register this chunk as a source of the last chunk.
        if chunk.dst != -1:
            chunks[chunk.dst].srcs.append(i)
    return chunks

def has_noun_in_chunk(c):
    """Return True when at least one morpheme of chunk ``c`` is a noun (名詞)."""
    return any(m.pos == '名詞' for m in c.morphs)

def has_verb_in_chunk(c: Chunk):
    """Return True when at least one morpheme of chunk ``c`` is a verb (動詞)."""
    return any(m.pos == '動詞' for m in c.morphs)

# Parse every EOS-delimited piece of the CaboCha output into chunks.
PATH = 'neko.txt.cabocha'
with open(PATH) as r:
    data = r.read().split('EOS')
output = [parseCabocha(sentence) for sentence in data]

# For output[7], print "noun chunk <TAB> verb chunk" for every dependency
# from a chunk containing a noun to a chunk containing a verb.
sentence_6th = output[7]
for chunk in sentence_6th:
    dst_num = chunk.dst
    if dst_num == -1:
        continue
    head = sentence_6th[dst_num]
    if not (has_noun_in_chunk(chunk) and has_verb_in_chunk(head)):
        continue
    chunk_out = ''.join(m.surface for m in chunk.morphs if m.pos != '記号')
    chunk_dst_r = ''.join(m.surface for m in head.morphs if m.pos != '記号')
    print(chunk_out, '\t', chunk_dst_r)

6文目だけを出力すると

吾輩は   見た
ここで   始めて
ものを   見た

44

dot言語では、

digraph {
        "吾輩は" -> "猫である"
        "名前は" -> "無い"
        "まだ" -> "無い"
}

のように書くとそれぞれを矢印でつないだ図を意味し、Graphvizなどで生成できる。

from graphviz import Digraph
class Morph(object):
    """A single morpheme read from one token line of CaboCha output.

    Attributes:
        surface: Surface form (the text before the tab).
        base: Base form (7th comma-separated feature).
        pos: Part of speech (1st feature).
        pos1: Part-of-speech subcategory (2nd feature).
    """

    def __init__(self, surface, base, pos, pos1):
        self.surface = surface
        self.base = base
        self.pos = pos
        self.pos1 = pos1


class Chunk(object):
    """A chunk (bunsetsu) with its morphemes and dependency links.

    Attributes:
        morphs: Morph objects that make up the chunk.
        dst: Index of the chunk this one depends on; -1 means no head.
        srcs: Indices of the chunks that depend on this one.
    """

    def __init__(self, morphs, dst):
        self.morphs = morphs
        self.dst = dst
        self.srcs = []


def parseCabocha(sentence):
    """Parse one sentence of ``cabocha -f1`` output into Chunk objects.

    Args:
        sentence: Text of a single sentence (one piece of the file split
            on ``EOS``).  Lines are either ``* idx dstD ...`` chunk headers
            or ``surface<TAB>feature,feature,...`` tokens.

    Returns:
        The sentence's chunks in order, each with ``dst`` taken from its
        header and ``srcs`` listing the chunks that point at it.
    """
    chunks = []
    morphs = []
    dst = -1  # tokens that precede any header form a head-less chunk

    for line in sentence.split('\n'):
        # A header has no tab; a token whose surface is '*' does.
        is_header = line.startswith('*') and '\t' not in line
        if line == '' or is_header:
            # Both a blank line and a new header close the open chunk.
            if morphs:
                chunks.append(Chunk(morphs, dst))
                morphs = []
            if is_header:
                dst = int(line.split(' ')[2].rstrip('D'))
            continue
        surface, feature = line.split('\t')[:2]
        attrs = feature.split(',')
        morphs.append(Morph(surface, attrs[6], attrs[0], attrs[1]))

    if morphs:
        # Input that does not end with a newline still yields its last chunk.
        chunks.append(Chunk(morphs, dst))

    for i, chunk in enumerate(chunks):
        # dst == -1 means "no head"; using it as an index would wrongly
        # register this chunk as a source of the last chunk.
        if chunk.dst != -1:
            chunks[chunk.dst].srcs.append(i)
    return chunks

# Parse every EOS-delimited piece of the CaboCha output into chunks.
PATH = 'neko.txt.cabocha'
with open(PATH) as r:
    data = r.read().split('EOS')
sentences = []
for sentence in data:
    sentences.append(parseCabocha(sentence))

# Only the first ten parsed pieces are drawn.
sentences = sentences[:10]
dot = Digraph()


def _dot_id(text):
    """Return ``text`` as a double-quoted DOT identifier."""
    # Inside a quoted DOT string only the double quote needs escaping.
    return '"' + text.replace('"', '\\"') + '"'


# The same edges are written twice: by hand into a .gv file and into the
# graphviz Digraph object.
PATH_O = '44_kakariuke.gv'
with open(PATH_O, 'w') as w:
    w.write('digraph graphname {\n')
    for sentence in sentences:
        for i, chunk in enumerate(sentence):
            dst_num = chunk.dst
            if dst_num == -1:
                # No head: nothing to draw, and -1 must not be used as an index.
                continue
            chunk_out = ''.join([morph.surface for morph in chunk.morphs if morph.pos != '記号'])
            chunk_dst_r = ''.join([morph.surface for morph in sentence[dst_num].morphs if morph.pos != '記号'])
            if chunk_out != '' and chunk_dst_r != '':
                # Quote the node names so that chunks containing ASCII
                # punctuation, spaces or leading digits stay valid DOT.
                w.write(_dot_id(chunk_out) + ' -> ' + _dot_id(chunk_dst_r) + ';\n')
                dot.edge(chunk_out, chunk_dst_r)
    w.write('}\n')
# The .gv file can be rendered with:
# $ dot -Tsvg 44_kakariuke.gv -o 44_kakariuke.svg
print(dot)
dot.render('44_output')  # a PDF is written

10文目までを出力すると

digraph {
        "吾輩は" -> "猫である"
        "名前は" -> "無い"
        "まだ" -> "無い"
        "どこで" -> "生れたか"
        "生れたか" -> "つかぬ"
        "とんと" -> "つかぬ"
        "見当が" -> "つかぬ"
        "何でも" -> "薄暗い"
        "薄暗い" -> "所で"
        "じめじめした" -> "所で"
        "所で" -> "泣いて"
        "ニャーニャー" -> "泣いて"
        "泣いて" -> "記憶している"
        "いた事だけは" -> "記憶している"
        "吾輩は" -> "見た"
        "ここで" -> "始めて"
        "始めて" -> "人間という"
        "人間という" -> "ものを"
        "ものを" -> "見た"
        "しかも" -> "種族であったそうだ"
        "あとで" -> "聞くと"
        "聞くと" -> "種族であったそうだ"
        "それは" -> "種族であったそうだ"
        "書生という" -> "人間中で"
        "人間中で" -> "種族であったそうだ"
        "一番" -> "獰悪な"
        "獰悪な" -> "種族であったそうだ"
        "この" -> "書生というのは"
        "書生というのは" -> "話である"
        "時々" -> "捕えて"
        "我々を" -> "捕えて"
        "捕えて" -> "煮て"
        "煮て" -> "食うという"
        "食うという" -> "話である"
}

であり、図は

となる。

45

class Morph(object):
    """A single morpheme read from one token line of CaboCha output.

    Attributes:
        surface: Surface form (the text before the tab).
        base: Base form (7th comma-separated feature).
        pos: Part of speech (1st feature).
        pos1: Part-of-speech subcategory (2nd feature).
    """

    def __init__(self, surface, base, pos, pos1):
        self.surface = surface
        self.base = base
        self.pos = pos
        self.pos1 = pos1


class Chunk(object):
    """A chunk (bunsetsu) with its morphemes and dependency links.

    Attributes:
        morphs: Morph objects that make up the chunk.
        dst: Index of the chunk this one depends on; -1 means no head.
        srcs: Indices of the chunks that depend on this one.
    """

    def __init__(self, morphs, dst):
        self.morphs = morphs
        self.dst = dst
        self.srcs = []


def parseCabocha(sentence):
    """Parse one sentence of ``cabocha -f1`` output into Chunk objects.

    Args:
        sentence: Text of a single sentence (one piece of the file split
            on ``EOS``).  Lines are either ``* idx dstD ...`` chunk headers
            or ``surface<TAB>feature,feature,...`` tokens.

    Returns:
        The sentence's chunks in order, each with ``dst`` taken from its
        header and ``srcs`` listing the chunks that point at it.
    """
    chunks = []
    morphs = []
    dst = -1  # tokens that precede any header form a head-less chunk

    for line in sentence.split('\n'):
        # A header has no tab; a token whose surface is '*' does.
        is_header = line.startswith('*') and '\t' not in line
        if line == '' or is_header:
            # Both a blank line and a new header close the open chunk.
            if morphs:
                chunks.append(Chunk(morphs, dst))
                morphs = []
            if is_header:
                dst = int(line.split(' ')[2].rstrip('D'))
            continue
        surface, feature = line.split('\t')[:2]
        attrs = feature.split(',')
        morphs.append(Morph(surface, attrs[6], attrs[0], attrs[1]))

    if morphs:
        # Input that does not end with a newline still yields its last chunk.
        chunks.append(Chunk(morphs, dst))

    for i, chunk in enumerate(chunks):
        # dst == -1 means "no head"; using it as an index would wrongly
        # register this chunk as a source of the last chunk.
        if chunk.dst != -1:
            chunks[chunk.dst].srcs.append(i)
    return chunks

def has_verb_in_chunk(c: Chunk):
    """Return True when at least one morpheme of chunk ``c`` is a verb (動詞)."""
    return any(m.pos == '動詞' for m in c.morphs)

def first_verb_base_in_chunk(c:Chunk):
    """Return the base form of the leftmost verb (動詞) in chunk ``c``.

    When the chunk has no verb an error line is printed and None is
    returned implicitly.
    """
    verb_bases = [m.base for m in c.morphs if m.pos == '動詞']
    if verb_bases:
        return verb_bases[0]
    print('ERROR! in first_verb_in_chunk')
    
# Parse every EOS-delimited piece of the CaboCha output into chunks.
PATH = 'neko.txt.cabocha'
with open(PATH) as r:
    data = r.read().split('EOS')
sentences = []
for sentence in data:
    sentences.append(parseCabocha(sentence))

# One line per verb chunk: base form of its first verb, a tab, and the
# sorted particles that end the chunks depending on it.
PATH_O = '45_kaku_pattern.txt'
with open(PATH_O, 'w') as w:
    for sentence in sentences:
        for chunk in sentence:
            if not has_verb_in_chunk(chunk):
                continue
            first_verb_base = first_verb_base_in_chunk(chunk)
            joshis = [
                sentence[src].morphs[-1].surface
                for src in chunk.srcs
                if sentence[src].morphs[-1].pos == '助詞'
            ]
            if not joshis:
                continue
            w.write('{}\t{}\n'.format(first_verb_base, ' '.join(sorted(joshis))))

出力は

生れる	で
つく	か が
泣く	で
する	て は
始める	で

で始まる。
動詞を含む文節に対し、そのsrcの中から助詞で終わる文節を取り出し、最後の単語のsurfaceをjoshisに入れ、動詞と組み合わせて書き出した。

$ sort 45_kaku_pattern.txt | uniq -c | sort -nr | head

を実行すると

 741 云う       と
 536 する       を
 356 思う       と
 241 ある       が
 236 なる       に
 223 する       に
 214 見る       て
 170 する       と
 133 する       が
 127 する       に を

を出力。

「する」の動詞の格パターンは

# Same extraction as above, restricted to chunks whose first verb is する.
PATH_O = '45_kaku_pattern_suru.txt'
with open(PATH_O, 'w') as w:
    for sentence in sentences:
        for chunk in sentence:
            if not has_verb_in_chunk(chunk):
                continue
            first_verb_base = first_verb_base_in_chunk(chunk)
            if first_verb_base != 'する':
                continue
            joshis = [
                sentence[src].morphs[-1].surface
                for src in chunk.srcs
                if sentence[src].morphs[-1].pos == '助詞'
            ]
            if not joshis:
                continue
            w.write('{}\t{}\n'.format(first_verb_base, ' '.join(sorted(joshis))))

と条件付きで書き出せば

$ sort 45_kaku_pattern_suru.txt | uniq -c | sort -nr | head

で

 536 する       を
 223 する       に
 170 する       と
 133 する       が
 127 する       に を
 104 する       は
  74 する       て を
  66 する       も
  66 する       て
  62 する       が を

「見る」

 214 見る       て
 105 見る       を
  22 見る       から
  19 見る       て は
  18 見る       て て
  17 見る       と
  13 見る       て を
  12 見る       で
  10 見る       から て
   9 見る       に

「与える」

   6 与える     に を
   2 与える     は を
   1 与える     か として
   1 与える     て と に は を
   1 与える     に に は を
   1 与える     て に は を
   1 与える     て と は を
   1 与える     は は も
   1 与える     て に を
   1 与える     も を

46

class Morph(object):
    """A single morpheme read from one token line of CaboCha output.

    Attributes:
        surface: Surface form (the text before the tab).
        base: Base form (7th comma-separated feature).
        pos: Part of speech (1st feature).
        pos1: Part-of-speech subcategory (2nd feature).
    """

    def __init__(self, surface, base, pos, pos1):
        self.surface = surface
        self.base = base
        self.pos = pos
        self.pos1 = pos1


class Chunk(object):
    """A chunk (bunsetsu) with its morphemes and dependency links.

    Attributes:
        morphs: Morph objects that make up the chunk.
        dst: Index of the chunk this one depends on; -1 means no head.
        srcs: Indices of the chunks that depend on this one.
    """

    def __init__(self, morphs, dst):
        self.morphs = morphs
        self.dst = dst
        self.srcs = []


def parseCabocha(sentence):
    """Parse one sentence of ``cabocha -f1`` output into Chunk objects.

    Args:
        sentence: Text of a single sentence (one piece of the file split
            on ``EOS``).  Lines are either ``* idx dstD ...`` chunk headers
            or ``surface<TAB>feature,feature,...`` tokens.

    Returns:
        The sentence's chunks in order, each with ``dst`` taken from its
        header and ``srcs`` listing the chunks that point at it.
    """
    chunks = []
    morphs = []
    dst = -1  # tokens that precede any header form a head-less chunk

    for line in sentence.split('\n'):
        # A header has no tab; a token whose surface is '*' does.
        is_header = line.startswith('*') and '\t' not in line
        if line == '' or is_header:
            # Both a blank line and a new header close the open chunk.
            if morphs:
                chunks.append(Chunk(morphs, dst))
                morphs = []
            if is_header:
                dst = int(line.split(' ')[2].rstrip('D'))
            continue
        surface, feature = line.split('\t')[:2]
        attrs = feature.split(',')
        morphs.append(Morph(surface, attrs[6], attrs[0], attrs[1]))

    if morphs:
        # Input that does not end with a newline still yields its last chunk.
        chunks.append(Chunk(morphs, dst))

    for i, chunk in enumerate(chunks):
        # dst == -1 means "no head"; using it as an index would wrongly
        # register this chunk as a source of the last chunk.
        if chunk.dst != -1:
            chunks[chunk.dst].srcs.append(i)
    return chunks

def has_verb_in_chunk(c: Chunk):
    """Return True when at least one morpheme of chunk ``c`` is a verb (動詞)."""
    return any(m.pos == '動詞' for m in c.morphs)

def first_verb_base_in_chunk(c:Chunk):
    """Return the base form of the leftmost verb (動詞) in chunk ``c``.

    When the chunk has no verb an error line is printed and None is
    returned implicitly.
    """
    verb_bases = [m.base for m in c.morphs if m.pos == '動詞']
    if verb_bases:
        return verb_bases[0]
    print('ERROR! in first_verb_in_chunk')
    
# Parse every EOS-delimited piece of the CaboCha output into chunks.
PATH = 'neko.txt.cabocha'
with open(PATH) as r:
    data = r.read().split('EOS')
sentences = []
for sentence in data:
    sentences.append(parseCabocha(sentence))

# One line per verb chunk: the verb's base form followed by tab-separated
# "particle chunk-text" items, ordered by particle.
PATH_O = '46_kaku_frame.txt'
with open(PATH_O, 'w') as w:
    for sentence in sentences:
        for chunk in sentence:
            if not has_verb_in_chunk(chunk):
                continue
            # Only chunks that contain a verb reach this point.
            first_verb_base = first_verb_base_in_chunk(chunk)
            pairs = []
            for src in chunk.srcs:
                dependent = sentence[src]
                last_morph = dependent.morphs[-1]
                if last_morph.pos != '助詞':
                    continue
                text = ''.join(m.surface for m in dependent.morphs if m.pos != '記号')
                pairs.append((last_morph.surface, text))
            if not pairs:
                continue
            pairs.sort(key=lambda pair: pair[0])
            fields = [first_verb_base]
            fields.extend(joshi + ' ' + text for joshi, text in pairs)
            w.write('\t'.join(fields) + '\n')

出力は

生れる	で どこで
つく	か 生れたか	が 見当が
泣く	で 所で
する	て 泣いて	は いた事だけは
始める	で ここで

階層が深くなってきたため、適宜リストなどにした方がいいのだろうか？

47

class Morph(object):
    """A single morpheme read from one token line of CaboCha output.

    Attributes:
        surface: Surface form (the text before the tab).
        base: Base form (7th comma-separated feature).
        pos: Part of speech (1st feature).
        pos1: Part-of-speech subcategory (2nd feature).
    """

    def __init__(self, surface, base, pos, pos1):
        self.surface = surface
        self.base = base
        self.pos = pos
        self.pos1 = pos1


class Chunk(object):
    """A chunk (bunsetsu) with its morphemes and dependency links.

    Attributes:
        morphs: Morph objects that make up the chunk.
        dst: Index of the chunk this one depends on; -1 means no head.
        srcs: Indices of the chunks that depend on this one.
    """

    def __init__(self, morphs, dst):
        self.morphs = morphs
        self.dst = dst
        self.srcs = []


def parseCabocha(sentence):
    """Parse one sentence of ``cabocha -f1`` output into Chunk objects.

    Args:
        sentence: Text of a single sentence (one piece of the file split
            on ``EOS``).  Lines are either ``* idx dstD ...`` chunk headers
            or ``surface<TAB>feature,feature,...`` tokens.

    Returns:
        The sentence's chunks in order, each with ``dst`` taken from its
        header and ``srcs`` listing the chunks that point at it.
    """
    chunks = []
    morphs = []
    dst = -1  # tokens that precede any header form a head-less chunk

    for line in sentence.split('\n'):
        # A header has no tab; a token whose surface is '*' does.
        is_header = line.startswith('*') and '\t' not in line
        if line == '' or is_header:
            # Both a blank line and a new header close the open chunk.
            if morphs:
                chunks.append(Chunk(morphs, dst))
                morphs = []
            if is_header:
                dst = int(line.split(' ')[2].rstrip('D'))
            continue
        surface, feature = line.split('\t')[:2]
        attrs = feature.split(',')
        morphs.append(Morph(surface, attrs[6], attrs[0], attrs[1]))

    if morphs:
        # Input that does not end with a newline still yields its last chunk.
        chunks.append(Chunk(morphs, dst))

    for i, chunk in enumerate(chunks):
        # dst == -1 means "no head"; using it as an index would wrongly
        # register this chunk as a source of the last chunk.
        if chunk.dst != -1:
            chunks[chunk.dst].srcs.append(i)
    return chunks

def has_verb_in_chunk(c: Chunk):
    """Return True when at least one morpheme of chunk ``c`` is a verb (動詞)."""
    return any(m.pos == '動詞' for m in c.morphs)

def has_sahen_in_chunk(c:Chunk):
    """Return True when chunk ``c`` holds a noun whose subcategory is サ変接続."""
    return any(m.pos == '名詞' and m.pos1 == 'サ変接続' for m in c.morphs)

def pos_sahen_in_chunk(c:Chunk):
    """Return the index of the first サ変接続 noun in chunk ``c``.

    When there is none, 'ERROR!' is printed and the string 'ERROR!' is
    returned instead of an index.
    """
    hits = [
        idx for idx, m in enumerate(c.morphs)
        if m.pos == '名詞' and m.pos1 == 'サ変接続'
    ]
    if not hits:
        print('ERROR!')
        return 'ERROR!'
    return hits[0]

def surface_from_chunk(c:Chunk):
    """Return the surfaces of all morphemes of chunk ``c`` joined together."""
    return ''.join(m.surface for m in c.morphs)

def first_verb_base_in_chunk(c:Chunk):
    """Return the base form of the leftmost verb (動詞) in chunk ``c``.

    When the chunk has no verb an error line is printed and None is
    returned implicitly.
    """
    verb_bases = [m.base for m in c.morphs if m.pos == '動詞']
    if verb_bases:
        return verb_bases[0]
    print('ERROR! in first_verb_in_chunk')

def last_pos_from_chunk(c:Chunk):
    """Return the part of speech of the last non-symbol morpheme of ``c``.

    '記号' is returned when the chunk has no morpheme other than symbols.
    """
    for morph in reversed(c.morphs):
        if morph.pos != '記号':
            return morph.pos
    return '記号'
def last_surface_from_chunk(c:Chunk):
    """Return the surface of the last non-symbol morpheme of ``c``.

    '記号 ERROR!' is returned when the chunk has no morpheme other than
    symbols.
    """
    for morph in reversed(c.morphs):
        if morph.pos != '記号':
            return morph.surface
    return '記号 ERROR!'


# Parse every EOS-delimited piece of the CaboCha output into chunks.
PATH = 'neko.txt.cabocha'
with open(PATH) as r:
    data = r.read().split('EOS')
sentences = [parseCabocha(sentence) for sentence in data]

# One line per "sahen noun + を" chunk that depends on a verb:
# predicate <TAB> particles <TAB> chunks carrying those particles.
PATH_O = '47_mining.txt'
with open(PATH_O, 'w') as w:
    for sentence in sentences:
        for chunk in sentence:
            if not has_sahen_in_chunk(chunk):
                continue
            next_pos = pos_sahen_in_chunk(chunk) + 1
            if next_pos >= len(chunk.morphs) or chunk.morphs[next_pos].surface != 'を':
                continue
            # From here on the chunk contains "sahen noun + を".
            if chunk.dst == -1:
                # No head chunk; -1 must not be used as a list index.
                continue
            head = sentence[chunk.dst]
            if head.morphs[0].pos != '動詞':
                continue
            # From here on "sahen noun + を" depends on a verb.
            chunk_surface = ''.join([morph.surface for morph in chunk.morphs if morph.pos != '記号'])
            chunk_dst_verb_base = head.morphs[0].base
            predicate = chunk_surface+chunk_dst_verb_base
            # Every other chunk with the same dst depends on the predicate.
            joshi_pairs = []
            for chunk2 in sentence:
                if chunk2 is chunk or chunk2.dst != chunk.dst:
                    continue
                if last_pos_from_chunk(chunk2) != '助詞':
                    continue
                chunk2_surface = ''.join([morph.surface for morph in chunk2.morphs if morph.pos != '記号'])
                joshi_pairs.append((last_surface_from_chunk(chunk2), chunk2_surface))
            if joshi_pairs:
                # Particles go in dictionary order, with the chunks kept
                # aligned to them (same ordering as the case-frame output).
                joshi_pairs.sort(key=lambda pair: pair[0])
                joshi_list = [joshi for joshi, _ in joshi_pairs]
                joshis_chunk_surface = [surface for _, surface in joshi_pairs]
                w.write(predicate+'\t'+' '.join(joshi_list)+'\t'+' '.join(joshis_chunk_surface)+'\n')

出力は

決心をする	と	こうと
返報をする	んで	偸んで
昼寝をする	が	彼が
迫害を加える	て	追い廻して
家族的生活をする	が を	我等猫族が 愛を

で始まる。

48

class Morph(object):
    """A single morpheme read from one token line of CaboCha output.

    Attributes:
        surface: Surface form (the text before the tab).
        base: Base form (7th comma-separated feature).
        pos: Part of speech (1st feature).
        pos1: Part-of-speech subcategory (2nd feature).
    """

    def __init__(self, surface, base, pos, pos1):
        self.surface = surface
        self.base = base
        self.pos = pos
        self.pos1 = pos1


class Chunk(object):
    """A chunk (bunsetsu) with its morphemes and dependency links.

    Attributes:
        morphs: Morph objects that make up the chunk.
        dst: Index of the chunk this one depends on; -1 means no head.
        srcs: Indices of the chunks that depend on this one.
    """

    def __init__(self, morphs, dst):
        self.morphs = morphs
        self.dst = dst
        self.srcs = []


def parseCabocha(sentence):
    """Parse one sentence of ``cabocha -f1`` output into Chunk objects.

    Args:
        sentence: Text of a single sentence (one piece of the file split
            on ``EOS``).  Lines are either ``* idx dstD ...`` chunk headers
            or ``surface<TAB>feature,feature,...`` tokens.

    Returns:
        The sentence's chunks in order, each with ``dst`` taken from its
        header and ``srcs`` listing the chunks that point at it.
    """
    chunks = []
    morphs = []
    dst = -1  # tokens that precede any header form a head-less chunk

    for line in sentence.split('\n'):
        # A header has no tab; a token whose surface is '*' does.
        is_header = line.startswith('*') and '\t' not in line
        if line == '' or is_header:
            # Both a blank line and a new header close the open chunk.
            if morphs:
                chunks.append(Chunk(morphs, dst))
                morphs = []
            if is_header:
                dst = int(line.split(' ')[2].rstrip('D'))
            continue
        surface, feature = line.split('\t')[:2]
        attrs = feature.split(',')
        morphs.append(Morph(surface, attrs[6], attrs[0], attrs[1]))

    if morphs:
        # Input that does not end with a newline still yields its last chunk.
        chunks.append(Chunk(morphs, dst))

    for i, chunk in enumerate(chunks):
        # dst == -1 means "no head"; using it as an index would wrongly
        # register this chunk as a source of the last chunk.
        if chunk.dst != -1:
            chunks[chunk.dst].srcs.append(i)
    return chunks

def has_noun_in_chunk(c: Chunk):
    """Return True when at least one morpheme of chunk ``c`` is a noun (名詞)."""
    return any(m.pos == '名詞' for m in c.morphs)

# Parse every EOS-delimited piece of the CaboCha output into chunks.
PATH = 'neko.txt.cabocha'
with open(PATH) as r:
    data = r.read().split('EOS')
sentences = []
for sentence in data:
    sentences.append(parseCabocha(sentence))

sentences = sentences[7:11]

# For every chunk that contains a noun, follow dst links until a chunk
# without a head is reached and print the visited chunks joined by ' -> '.
for sentence in sentences:
    for start in sentence:
        if not has_noun_in_chunk(start):
            continue
        chain = []
        node = start
        while True:
            chain.append(''.join(morph.surface for morph in node.morphs if morph.pos != '記号'))
            if node.dst == -1:
                break
            node = sentence[node.dst]
        print(' -> '.join(chain))

sentences = sentences[7:11]とし、7〜10行目における出力は

吾輩は -> 見た
ここで -> 始めて -> 人間という -> ものを -> 見た
人間という -> ものを -> 見た
ものを -> 見た
あとで -> 聞くと -> 種族であったそうだ
それは -> 種族であったそうだ
書生という -> 人間中で -> 種族であったそうだ
人間中で -> 種族であったそうだ
一番 -> 獰悪な -> 種族であったそうだ
獰悪な -> 種族であったそうだ
種族であったそうだ
書生というのは -> 話である
我々を -> 捕えて -> 煮て -> 食うという -> 話である
話である
当時は -> なかったから -> 思わなかった
何という -> 考も -> なかったから -> 思わなかった
考も -> なかったから -> 思わなかった

49

class Morph(object):
    """A single morpheme read from one token line of CaboCha output.

    Attributes:
        surface: Surface form (the text before the tab).
        base: Base form (7th comma-separated feature).
        pos: Part of speech (1st feature).
        pos1: Part-of-speech subcategory (2nd feature).
    """

    def __init__(self, surface, base, pos, pos1):
        self.surface = surface
        self.base = base
        self.pos = pos
        self.pos1 = pos1


class Chunk(object):
    """A chunk (bunsetsu) with its morphemes and dependency links.

    Attributes:
        morphs: Morph objects that make up the chunk.
        dst: Index of the chunk this one depends on; -1 means no head.
        srcs: Indices of the chunks that depend on this one.
    """

    def __init__(self, morphs, dst):
        self.morphs = morphs
        self.dst = dst
        self.srcs = []


def parseCabocha(sentence):
    """Parse one sentence of ``cabocha -f1`` output into Chunk objects.

    Args:
        sentence: Text of a single sentence (one piece of the file split
            on ``EOS``).  Lines are either ``* idx dstD ...`` chunk headers
            or ``surface<TAB>feature,feature,...`` tokens.

    Returns:
        The sentence's chunks in order, each with ``dst`` taken from its
        header and ``srcs`` listing the chunks that point at it.
    """
    chunks = []
    morphs = []
    dst = -1  # tokens that precede any header form a head-less chunk

    for line in sentence.split('\n'):
        # A header has no tab; a token whose surface is '*' does.
        is_header = line.startswith('*') and '\t' not in line
        if line == '' or is_header:
            # Both a blank line and a new header close the open chunk.
            if morphs:
                chunks.append(Chunk(morphs, dst))
                morphs = []
            if is_header:
                dst = int(line.split(' ')[2].rstrip('D'))
            continue
        surface, feature = line.split('\t')[:2]
        attrs = feature.split(',')
        morphs.append(Morph(surface, attrs[6], attrs[0], attrs[1]))

    if morphs:
        # Input that does not end with a newline still yields its last chunk.
        chunks.append(Chunk(morphs, dst))

    for i, chunk in enumerate(chunks):
        # dst == -1 means "no head"; using it as an index would wrongly
        # register this chunk as a source of the last chunk.
        if chunk.dst != -1:
            chunks[chunk.dst].srcs.append(i)
    return chunks

def has_noun_in_chunk(c: Chunk):
    """Return True when at least one morpheme of chunk ``c`` is a noun (名詞)."""
    return any(m.pos == '名詞' for m in c.morphs)

def surface_from_chunk(c:Chunk):
    """Return the text of chunk ``c`` with symbol (記号) morphemes left out."""
    return ''.join(m.surface for m in c.morphs if m.pos != '記号')

def _replace_noun_phrase(c, placeholder):
    """Return the text of ``c`` with its first run of nouns replaced.

    The first consecutive run of 名詞 morphemes becomes ``placeholder``.
    Every other morpheme keeps its surface, except 記号, which are dropped.
    """
    parts = []
    state = 'before'  # 'before' -> 'inside' -> 'after' the replaced run
    for morph in c.morphs:
        if morph.pos == '名詞' and state != 'after':
            if state == 'before':
                parts.append(placeholder)
                state = 'inside'
            continue
        if state == 'inside':
            state = 'after'
        if morph.pos != '記号':
            parts.append(morph.surface)
    return ''.join(parts)


def Xized(c: "Chunk"):
    """Return the text of chunk ``c`` with its noun phrase replaced by 'X'.

    Works for chunks that do not start with a noun and for chunks that
    consist of nouns only (which yield just 'X').
    """
    return _replace_noun_phrase(c, 'X')


def Yized(c: "Chunk"):
    """Return the text of chunk ``c`` with its noun phrase replaced by 'Y'.

    Works for chunks that do not start with a noun and for chunks that
    consist of nouns only (which yield just 'Y').
    """
    return _replace_noun_phrase(c, 'Y')

# Parse every EOS-delimited piece of the CaboCha output into chunks.
PATH = 'neko.txt.cabocha'
with open(PATH) as r:
    data = r.read().split('EOS')

sentences = []
for sentence in data:
    sentences.append(parseCabocha(sentence))

sentences = sentences[7:8]

# For each pair of noun chunks (i < j), walk both chunks along their dst
# links until the two walks meet, then print the path between them.
for sentence in sentences:
    for i, chunk in enumerate(sentence):
        if not has_noun_in_chunk(chunk):
            continue
        # Only chunks that contain a noun reach this point.
        for j in range(i+1,len(sentence)):
            other = sentence[j]
            if not has_noun_in_chunk(other):
                continue
            chain1 = [Xized(chunk)]
            chain2 = [Yized(other)]
            pos_i, pos_j = i, j
            branched = False
            while pos_i != pos_j:
                # Always advance the walk that is closer to the sentence start.
                if pos_i < pos_j:
                    pos_i = int(sentence[pos_i].dst)
                    chain1.append(surface_from_chunk(sentence[pos_i]))
                else:
                    pos_j = int(sentence[pos_j].dst)
                    chain2.append(surface_from_chunk(sentence[pos_j]))
                    # Chunk j had to move, so it is not on chunk i's own path.
                    branched = True
            left = ' -> '.join(chain1[:-1])
            if branched:
                print(left + ' | ' + ' -> '.join(chain2[:-1]) + ' | ' + chain1[-1])
            else:
                print(left + '-> Y')

出力は

Xは | Yで -> 始めて -> 人間という -> ものを | 見た
Xは | Yという -> ものを | 見た
Xは | Yを | 見た
Xで -> 始めて-> Y
Xで -> 始めて -> 人間という-> Y
Xという-> Y

chain1とchain2を係る文節へとたどる。
このとき、先頭に近い側のchainを伸ばすことで、最短パスを見つけた。
一度でもchain2を伸ばした場合は「文節iから構文木の根に至る経路上に文節jが存在する場合」ではないので、branch_flagを1にして表示を切り替えた。