NLP 100 Exercises 2020 (Python): Notes on Problems 50-59

newsCorpora.csv

"1	Fed official says weak data caused by weather"	" should not slow taper	http://www.latimes.com/business/money/la-fi-mo-federal-reserve-plosser-stimulus-economy-20140310"	0	"1312750.story\?track=rss	Los Angeles Times	b	ddUyU0VZz0BRneMioxUPQVP6sIxvM	www.latimes.com	1394470370698"
"2	Fed's Charles Plosser sees high bar for change in pace of tapering	http://www.livemint.com/Politics/H2EvwJSK2VE6OF7iK1g3PP/Feds-Charles-Plosser-sees-high-bar-for-change-in-pace-of-ta.html	Livemint	b	ddUyU0VZz0BRneMioxUPQVP6sIxvM	www.livemint.com	1394470371207"			
"3	US open: Stocks fall after Fed official hints at accelerated tapering	http://www.ifamagazine.com/news/us-open-stocks-fall-after-fed-official-hints-at-accelerated-tapering-294436	IFA Magazine	b	ddUyU0VZz0BRneMioxUPQVP6sIxvM	www.ifamagazine.com	1394470371550"			

The data continues in this way.
Fields are tab-separated: field [1] is the title, [3] is the source (publisher), and [4] is the category label.

50

import random

PATH = 'newsCorpora.csv'
PUBLISHERS = {'Reuters', 'Huffington Post', 'Businessweek', 'Contactmusic.com', 'Daily Mail'}

# Collect [category, title] pairs for the five target publishers.
pub5_data = []
with open(PATH) as r:
    lines = r.read().splitlines()
for line in lines:
    fields = line.split('\t')
    if fields[3] in PUBLISHERS:
        pub5_data.append([fields[4], fields[1]])

pub5_data_shuffle = random.sample(pub5_data, len(pub5_data))
train = pub5_data_shuffle[:int(0.8*len(pub5_data_shuffle))]
valid = pub5_data_shuffle[int(0.8*len(pub5_data_shuffle)):int(0.9*len(pub5_data_shuffle))]
test =  pub5_data_shuffle[int(0.9*len(pub5_data_shuffle)):]

PATH_train = '50_train.txt'
PATH_valid = '50_valid.txt'
PATH_test  = '50_test.txt'
with open(PATH_train, 'w') as f_train:
    for line in train:
        f_train.write('\t'.join(line)+'\n')

with open(PATH_valid, 'w') as f_valid:
    for line in valid:
        f_valid.write('\t'.join(line)+'\n')

with open(PATH_test, 'w') as f_test:
    for line in test:
        f_test.write('\t'.join(line)+'\n')

random.sample shuffles the list (a full-length sample without replacement); slicing at the 80% and 90% marks then gives train, valid, and test.
The output files contain, for example, lines such as

e	Woman who MADE UP entire bestselling holocaust memoir is forced to pay back  ...
b	UK's FTSE 100 strengthens as Shire surges
e	Lady Gaga covers her entire head in a white furry mask to match her fringed  ...

and so on: one category label and one title per line, tab-separated.
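As an aside, the same 80/10/10 split can be done with scikit-learn; a minimal sketch applying train_test_split twice (this is an alternative, not what the code above does; random_state is fixed only for reproducibility):

from sklearn.model_selection import train_test_split

# pub5_data is the [category, title] list built above.
train, rest = train_test_split(pub5_data, test_size=0.2, random_state=0)
valid, test = train_test_split(rest, test_size=0.5, random_state=0)
print(len(train), len(valid), len(test))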

51

from sklearn.feature_extraction.text import CountVectorizer
import numpy as np

# Split each of train/valid/test into the feature text X and the answer labels y.
def extract_X_y(data):
    X = []
    y = []
    for line in data.splitlines():
        label, title = line.split('\t', 1)  # each line is "label<TAB>title"
        y.append(label)
        X.append(title)
    return X, y


PATH_train = '50_train.txt'
with open(PATH_train) as f_train:
    data = f_train.read()
    X_train, y_train = extract_X_y(data)

PATH_valid = '50_valid.txt'
with open(PATH_valid) as f_valid:
    data = f_valid.read()
    X_valid , y_valid = extract_X_y(data)

PATH_test = '50_test.txt'
with open(PATH_test) as f_test:
    data = f_test.read()
    X_test, y_test = extract_X_y(data)

# Write out y
PATH_y_train = 'y_train.txt'
with open(PATH_y_train, 'w') as f_y_train:
    f_y_train.write('\n'.join(y_train))
PATH_y_valid = 'y_valid.txt'
with open(PATH_y_valid, 'w') as f_y_valid:
    f_y_valid.write('\n'.join(y_valid))
PATH_y_test  = 'y_test.txt'
with open(PATH_y_test , 'w') as f_y_test:
    f_y_test.write('\n'.join(y_test))

vec_count = CountVectorizer()
vec_count.fit(X_train)
X = vec_count.transform(X_train)
Y = vec_count.transform(X_valid)
Z = vec_count.transform(X_test)

'''In sparse-matrix notation, row 0 looks like
(0, 581)      1
(0, 1549)     1
(0, 9871)     1
(0, 9897)     1
(0, 10088)    1
(0, 10916)    1
(0, 12664)    1
'''

# Convert to ordinary (dense) matrices.
train_feature = X.toarray()
valid_feature = Y.toarray()
test_feature  = Z.toarray()

PATH_train_feature = 'train.feature.txt'
with open(PATH_train_feature, 'w') as w1:
    for line in train_feature:
        w1.write(' '.join([str(num) for num in line]) + '\n')

PATH_valid_feature = 'valid.feature.txt'
with open(PATH_valid_feature, 'w') as w2:
    for line in valid_feature:
        w2.write(' '.join([str(num) for num in line]) + '\n')

PATH_test_feature = 'test.feature.txt'
with open(PATH_test_feature, 'w') as w3:
    for line in test_feature:
        w3.write(' '.join([str(num) for num in line]) + '\n')

I built one-hot (bag-of-words) vectors using the vocabulary of train.
The files are large, but this approach is comparatively simple, which is why I chose it.
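Since file size is the pain point, a minimal space-saving sketch (assuming scipy is installed) is to keep the CountVectorizer output sparse instead of writing dense rows:

from scipy.sparse import save_npz, load_npz

save_npz('train.feature.npz', X)        # X is the sparse matrix from vec_count.transform(X_train)
X_back = load_npz('train.feature.npz')  # round-trips without densifying
print(X_back.shape)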

52

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

import numpy as np

# Split into the feature text X and the answer labels y.
def extract_X_y(data):
    X = []
    y = []
    for line in data.split('\n'):
        if line == '':
            continue
        label, title = line.split('\t', 1)
        y.append(label)
        X.append(title)
    return X, y

PATH_train = '50_train.txt'
with open(PATH_train) as f_train:
    data = f_train.read()
    X_train, y_train = extract_X_y(data)

vec_count = CountVectorizer()
vec_count.fit(X_train)
X = vec_count.transform(X_train)

train_feature = X.toarray()  # convert to an ordinary (dense) matrix

# Convert to numpy arrays
X_train_np = np.array(train_feature)
y_train_np = np.array(y_train)

# Logistic regression
model = LogisticRegression(random_state=None, max_iter=1000)
model.fit(X_train_np, y_train_np)

I used LogisticRegression from sklearn.linear_model.
The inputs are numpy arrays rather than plain Python lists.
Create a model with LogisticRegression(), then train it by calling .fit(features, labels) on that model.
This step only fits the regression and prints nothing.
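For intuition about the API, a self-contained toy sketch (the data here is made up, not from the task):

import numpy as np
from sklearn.linear_model import LogisticRegression

X_toy = np.array([[2, 0], [1, 1], [0, 2], [0, 3]])  # two count features per sample
y_toy = np.array(['b', 'b', 'e', 'e'])              # two classes

toy_model = LogisticRegression()
toy_model.fit(X_toy, y_toy)            # learn the weights
print(toy_model.predict([[3, 0]]))     # expected: ['b'] for this toy data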

53

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

import numpy as np

# Split each of train/valid/test into the feature text X and the answer labels y.
def extract_X_y(data):
    X = []
    y = []
    for line in data.split('\n'):
        if line == '':
            continue
        label, title = line.split('\t', 1)
        y.append(label)
        X.append(title)
    return X, y

with open('50_train.txt') as f_train:
    X_train, y_train = extract_X_y(f_train.read())
with open('50_valid.txt') as f_valid:
    X_valid, y_valid = extract_X_y(f_valid.read())
with open('50_test.txt') as f_test:
    X_test, y_test = extract_X_y(f_test.read())

vec_count = CountVectorizer()
vec_count.fit(X_train)

train_feature = vec_count.transform(X_train).toarray()  # convert to an ordinary (dense) matrix
valid_feature = vec_count.transform(X_valid).toarray()
test_feature  = vec_count.transform(X_test).toarray()

# Convert to numpy arrays
X_train_np = np.array(train_feature)
y_train_np = np.array(y_train)

# Logistic regression
model = LogisticRegression(random_state=None, max_iter=10000)
model.fit(X_train_np, y_train_np)

pred_train = model.predict(X_train_np)
pred_train_proba = model.predict_proba(X_train_np) # b e m t
print('pred_train: ', pred_train)
print('pred_train_proba: ', pred_train_proba)

Sample output:

pred_train:  ['b' 't' 'b' ... 'm' 'e' 'b']
pred_train_proba:  [[8.89689407e-01 1.85913832e-02 2.63040290e-02 6.54151808e-02]
 [1.34468354e-01 9.68217362e-02 5.51499875e-02 7.13559922e-01]
 [9.99953355e-01 6.10473669e-06 2.17606365e-05 1.87801207e-05]
 ...
 [1.29665708e-02 2.69734697e-02 7.49512905e-01 2.10547055e-01]
 [4.02224038e-02 9.20044044e-01 1.79165884e-02 2.18169643e-02]
 [8.70917324e-01 7.66668727e-02 9.93797901e-03 4.24778239e-02]]

Calling .predict(features) on the trained model returns the logistic-regression classification results as a numpy array.
Likewise, .predict_proba(features) returns the predicted class probabilities.

If you set
np.set_printoptions(suppress=True, precision=3)
before printing, you get

pred_train:  ['b' 't' 'b' ... 'm' 'e' 'b']
pred_train_proba:  [[0.89  0.019 0.026 0.065]
 [0.134 0.097 0.055 0.714]
 [1.    0.    0.    0.   ]
 ...
 [0.013 0.027 0.75  0.211]
 [0.04  0.92  0.018 0.022]
 [0.871 0.077 0.01  0.042]]

which is much easier to read.
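A small sketch, reusing model and X_train_np from above, to confirm how the two methods relate: .predict returns the class with the largest probability, with columns ordered by model.classes_ (here b, e, m, t):

proba = model.predict_proba(X_train_np)
labels = model.classes_[np.argmax(proba, axis=1)]          # most probable class per row
print(np.array_equal(labels, model.predict(X_train_np)))  # expected: True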

54

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import numpy as np

np.set_printoptions(suppress=True, precision=3, formatter={'float': '{:.2f}'.format})

# Split each of train/valid/test into the feature text X and the answer labels y.
def extract_X_y(data):
    X = []
    y = []
    for line in data.split('\n'):
        if line == '':
            continue
        label, title = line.split('\t', 1)
        y.append(label)
        X.append(title)
    return X, y

with open('50_train.txt') as f_train:
    X_train, y_train = extract_X_y(f_train.read())
with open('50_valid.txt') as f_valid:
    X_valid, y_valid = extract_X_y(f_valid.read())
with open('50_test.txt') as f_test:
    X_test, y_test = extract_X_y(f_test.read())

vec_count = CountVectorizer()
vec_count.fit(X_train)

train_feature = vec_count.transform(X_train).toarray()  # convert to an ordinary (dense) matrix
valid_feature = vec_count.transform(X_valid).toarray()
test_feature  = vec_count.transform(X_test).toarray()

# Convert to numpy arrays
X_train_np = np.array(train_feature)
X_valid_np = np.array(valid_feature)
X_test_np  = np.array(test_feature)

y_train_np = np.array(y_train)
y_valid_np = np.array(y_valid)
y_test_np  = np.array(y_test)

# Logistic regression
model = LogisticRegression(random_state=None, max_iter=10000)
model.fit(X_train_np, y_train_np)

pred_train = model.predict(X_train_np)# b e m t
acc_train = accuracy_score(y_train_np, pred_train)
print('acc_train: ', acc_train)

pred_valid = model.predict(X_valid_np)
acc_valid = accuracy_score(y_valid_np, pred_valid)
print('acc_valid: ', acc_valid)

pred_test = model.predict(X_test_np)
acc_test  = accuracy_score(y_test_np, pred_test)
print('acc_test: ', acc_test)

Sample output:

acc_train:  0.9960688880569075
acc_valid:  0.905688622754491
acc_test:  0.9169161676646707

With accuracy_score from sklearn.metrics, calling accuracy_score(gold_labels, predictions) computes the accuracy.
Bag-of-words features alone already reach roughly 90% accuracy.
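accuracy_score itself is nothing more than the fraction of matching positions; a one-line numpy check:

print(np.mean(y_test_np == pred_test))  # elementwise match rate; should equal acc_test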

55

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

import numpy as np

# Split each of train/valid/test into the feature text X and the answer labels y.
def extract_X_y(data):
    X = []
    y = []
    for line in data.split('\n'):
        if line == '':
            continue
        label, title = line.split('\t', 1)
        y.append(label)
        X.append(title)
    return X, y

with open('50_train.txt') as f_train:
    X_train, y_train = extract_X_y(f_train.read())
with open('50_valid.txt') as f_valid:
    X_valid, y_valid = extract_X_y(f_valid.read())
with open('50_test.txt') as f_test:
    X_test, y_test = extract_X_y(f_test.read())

print('X_train size =', len(X_train), len(X_train[0]))  # 10684 titles, roughly 57 characters each

vec_count = CountVectorizer()
vec_count.fit(X_train)

train_feature = vec_count.transform(X_train).toarray()  # convert to an ordinary (dense) matrix
valid_feature = vec_count.transform(X_valid).toarray()
test_feature  = vec_count.transform(X_test).toarray()

# Convert to numpy arrays
X_train_np = np.array(train_feature)
X_valid_np = np.array(valid_feature)
X_test_np  = np.array(test_feature)
y_train_np = np.array(y_train)
y_valid_np = np.array(y_valid)
y_test_np  = np.array(y_test)

# Logistic regression
model = LogisticRegression(random_state=None, max_iter=10000)
model.fit(X_train_np, y_train_np)

# Results on the training data
pred_train = model.predict(X_train_np)
Conf_Mat_train = confusion_matrix(y_train_np, pred_train)
print('Conf_Mat_train:\n',Conf_Mat_train)

# Results on the test data
pred_test = model.predict(X_test_np)
Conf_Mat_test = confusion_matrix(y_test_np, pred_test)
print('Conf_Mat_test:\n',Conf_Mat_test)

出力は例えば

Conf_Mat_train:
 [[4477    2    0   10]
 [   8 4218    1    2]
 [   2    2  731    0]
 [  12    2    0 1217]]
Conf_Mat_test:
 [[532  20   4  14]
 [  7 530   0   2]
 [ 11  11  45   7]
 [ 31  14   3 105]]

The counts of correct predictions lie on the diagonal.
For example, the 10 in Conf_Mat_train is the number of category-0 items that were (mis)classified as category 3.
This can also be checked against the per-category counts at the problem-50 split, which were 4489, 1231, 4229, and 735.*1
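A tiny sketch with made-up labels to pin down the convention (rows are true labels, columns are predictions, in sorted label order):

from sklearn.metrics import confusion_matrix

y_true = ['b', 'b', 'e', 'e']
y_pred = ['b', 'e', 'e', 'e']
print(confusion_matrix(y_true, y_pred))
# [[1 1]   row 'b': one b correct, one b misread as e
#  [0 2]]  row 'e': both e correct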

56

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

import numpy as np

# Split each of train/valid/test into the feature text X and the answer labels y.
def extract_X_y(data):
    X = []
    y = []
    for line in data.split('\n'):
        if line == '':
            continue
        label, title = line.split('\t', 1)
        y.append(label)
        X.append(title)
    return X, y

with open('50_train.txt') as f_train:
    X_train, y_train = extract_X_y(f_train.read())
with open('50_valid.txt') as f_valid:
    X_valid , y_valid = extract_X_y(f_valid.read())
with open('50_test.txt') as f_test:
    X_test, y_test = extract_X_y(f_test.read())

print('X_train size =', len(X_train), len(X_train[0]))  # 10684 titles, roughly 57 characters each

vec_count = CountVectorizer()
vec_count.fit(X_train)

train_feature = vec_count.transform(X_train).toarray()  # convert to an ordinary (dense) matrix
valid_feature = vec_count.transform(X_valid).toarray()
test_feature  = vec_count.transform(X_test).toarray()

# Convert to numpy arrays
X_train_np = np.array(train_feature)
X_valid_np = np.array(valid_feature)
X_test_np  = np.array(test_feature)

y_train_np = np.array(y_train)
y_valid_np = np.array(y_valid)
y_test_np  = np.array(y_test)

# Size check
print('X_train_np size =', X_train_np.shape)
print('y_train_np size =', y_train_np.shape)

# Logistic regression
model = LogisticRegression(random_state=None, max_iter=10000)
model.fit(X_train_np, y_train_np)

# Results on the training data
pred_train = model.predict(X_train_np)# b e m t
acc_train = accuracy_score(y_train_np, pred_train)

# Results on the test data
pred_test = model.predict(X_test_np)
acc_test  = accuracy_score(y_test_np, pred_test)
conf_mat_test = confusion_matrix(y_test_np, pred_test)
print('Conf_Mat_test:\n', conf_mat_test)  # conf_mat_test[m][n] = items with true label m predicted as n,
# i.e. the row index is the true label and the column index is the prediction
acc_test_by_category  = [np.sum([conf_mat_test[i][j] for i in range(4) for j in range(4) if i == k and j == k or i != k and j != k])/np.sum(conf_mat_test) for k in range(4)]
prec_test_by_category = [conf_mat_test[i][i]/np.sum(conf_mat_test, axis=0)[i] for i in range(4)]
reca_test_by_category = [conf_mat_test[i][i]/np.sum(conf_mat_test, axis=1)[i] for i in range(4)]

# Rename temporarily to keep the F1 formula short
p = prec_test_by_category.copy()
r = reca_test_by_category.copy()
f1_test_by_category = [2*p[i]*r[i] / (p[i]+r[i]) for i in range(4)]

print(prec_test_by_category)
print(reca_test_by_category)
print(f1_test_by_category)

acc_macro = np.mean(acc_test_by_category)
prec_macro= np.mean(prec_test_by_category)
reca_macro= np.mean(reca_test_by_category)
f1_macro  = np.mean(f1_test_by_category)
print('acc_macro:  ' ,acc_macro)
print('prec_macro: ',prec_macro)
print('reca_macro: ',reca_macro)
print('f1_macro:   ', f1_macro)

acc_micro = np.mean(np.sum([conf_mat_test[i][i] for i in range(4)])/np.sum(conf_mat_test))
print('acc_micro = prec_micro = reca_micro: ', acc_micro)

print('---')

from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
print (precision_score( y_test_np, pred_test, average=None))
print (recall_score( y_test_np, pred_test, average=None))
print (f1_score( y_test_np, pred_test, average=None))
print (precision_score( y_test_np, pred_test, average='micro' ))
print (recall_score( y_test_np, pred_test, average='micro' ))
print (f1_score( y_test_np, pred_test, average='micro' ))

print (precision_score( y_test_np, pred_test, average='macro' ))
print (recall_score( y_test_np, pred_test, average='macro' ))
print (f1_score( y_test_np, pred_test, average='macro' ))

Sample output:

[0.9156626506024096, 0.9217391304347826, 0.8653846153846154, 0.8203125]
[0.9333333333333333, 0.9833024118738405, 0.6081081081081081, 0.6862745098039216]
[0.9244135534317984, 0.9515260323159784, 0.7142857142857144, 0.7473309608540926]
acc_macro:   0.9535928143712575
prec_macro:  0.8807747241054519
reca_macro:  0.8027545907798008
f1_macro:    0.8343890652218959
acc_micro = prec_micro = reca_micro:  0.907185628742515
---
[0.91566265 0.92173913 0.86538462 0.8203125 ]
[0.93333333 0.98330241 0.60810811 0.68627451]
[0.92441355 0.95152603 0.71428571 0.74733096]
0.907185628742515
0.907185628742515
0.907185628742515
0.8807747241054519
0.8027545907798008
0.8343890652218959

I wrote acc_test_by_category as a list comprehension.
I temporarily renamed the variables to simplify the formula, but that does not seem like good style, so I will not overuse it.
Seeing acc_micro == prec_micro == reca_micro made me doubt my understanding, but it is in fact correct.
Everything was already provided by precision_score, recall_score, and f1_score in sklearn.metrics, so there was no need to write the list comprehensions.
Using the library functions seems the safest choice.*2
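Why the micro-averaged scores coincide with accuracy: micro-averaging pools the counts of all classes, and in single-label classification every prediction is counted exactly once, so the pooled TP total is the trace of the confusion matrix. A short check reusing conf_mat_test from above:

# micro precision = sum(TP) / sum(TP + FP); pooled over all classes this
# is trace(conf) / total, which is exactly the accuracy.
tp_total = np.trace(conf_mat_test)
print(tp_total / np.sum(conf_mat_test))  # should equal acc_micro above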

57

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

import numpy as np

# Split each of train/valid/test into the feature text X and the answer labels y.
def extract_X_y(data):
    X = []
    y = []
    for line in data.split('\n'):
        if line == '':
            continue
        label, title = line.split('\t', 1)
        y.append(label)
        X.append(title)
    return X, y

with open('50_train.txt') as f_train:
    X_train, y_train = extract_X_y(f_train.read())
with open('50_valid.txt') as f_valid:
    X_valid , y_valid = extract_X_y(f_valid.read())
with open('50_test.txt') as f_test:
    X_test, y_test = extract_X_y(f_test.read())

vec_count = CountVectorizer()
vec_count.fit(X_train)

train_feature = vec_count.transform(X_train).toarray()  # convert to an ordinary (dense) matrix
valid_feature = vec_count.transform(X_valid).toarray()
test_feature  = vec_count.transform(X_test).toarray()

# Convert to numpy arrays
X_train_np = np.array(train_feature)
X_valid_np = np.array(valid_feature)
X_test_np  = np.array(test_feature)

y_train_np = np.array(y_train)
y_valid_np = np.array(y_valid)
y_test_np  = np.array(y_test)

# Logistic regression
model = LogisticRegression(random_state=None, max_iter=10000)
model.fit(X_train_np, y_train_np)

# Results on the training data
pred_train = model.predict(X_train_np)# b e m t
acc_train = accuracy_score(y_train_np, pred_train)

# Results on the test data
pred_test = model.predict(X_test_np)
acc_test = accuracy_score(y_test_np, pred_test)

for i, c_name in enumerate(['business', 'entertainment', 'medical', 'technology']):
    print('---typical word in ' + c_name + ' category---')
    index_list = np.argsort(model.coef_[i])
    sorted_coef_ = sorted(model.coef_[i])
    # top 10 weights (largest first), then the 10 most negative (ending at the most negative);
    # j renamed from i so the inner loop no longer shadows the category index
    for j in list(range(-1, -11, -1)) + list(range(9, -1, -1)):
        print(vec_count.get_feature_names()[index_list[j]], f': {sorted_coef_[j]:.4g}')

Sample output:

---typical word in business category---
bank : 1.953
ecb : 1.713
fed : 1.707
yellen : 1.596
china : 1.583
dollar : 1.541
obamacare : 1.523
ukraine : 1.512
mcdonald : 1.457
euro : 1.45
star : -0.9617
cap : -0.9827
heartbleed : -0.9846
subscription : -1.026
nintendo : -1.058
she : -1.075
twitch : -1.126
ebola : -1.149
activision : -1.392
aereo : -1.505
---typical word in entertainment category---
kardashian : 1.573
chris : 1.548
thrones : 1.466
paul : 1.402
transformers : 1.393
miley : 1.383
movie : 1.378
cyrus : 1.315
film : 1.303
music : 1.271
climate : -1.037
study : -1.063
ebola : -1.1
data : -1.112
billion : -1.148
gm : -1.234
china : -1.252
facebook : -1.277
microsoft : -1.279
google : -1.531
---typical word in medical category---
ebola : 2.76
fda : 2.239
cancer : 2.108
mers : 1.904
study : 1.746
medical : 1.666
cases : 1.645
doctors : 1.576
drug : 1.546
cigarettes : 1.545
cars : -0.5223
mobile : -0.5231
netflix : -0.5927
bank : -0.6168
climate : -0.6489
twitter : -0.732
apple : -0.748
facebook : -0.75
dimon : -0.7779
gm : -0.8259
---typical word in technology category---
facebook : 2.757
google : 2.563
microsoft : 2.501
apple : 2.489
climate : 2.362
heartbleed : 1.906
fcc : 1.707
gm : 1.691
tesla : 1.655
nasa : 1.621
accused : -0.7154
fed : -0.7429
move : -0.7849
american : -0.7886
health : -0.7966
grows : -0.8256
concerns : -0.8273
valued : -0.8418
percent : -0.8496
stocks : -1.191

"bank" raises the probability that an article belongs to category b, and "aereo" lowers it; the absolute value of the weight is the strength of that effect. It felt odd that aereo, a technology company, is such a strong signal against b rather than simply not being b, but every article containing Aereo (actually capitalized in the data) belonged to category e or t.
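One caveat in the code above: the row order of model.coef_ follows model.classes_ (alphabetical b, e, m, t), so a safer sketch reads the category names from that attribute instead of hard-coding them (note that newer scikit-learn versions replace get_feature_names with get_feature_names_out):

for label, coefs in zip(model.classes_, model.coef_):
    top = np.argsort(coefs)[::-1][:3]  # indices of the three largest weights
    print(label, [vec_count.get_feature_names()[j] for j in top])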

58

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
import numpy as np
import matplotlib.pyplot as plt

# Split each of train/valid/test into the feature text X and the answer labels y.
def extract_X_y(data):
    X = []
    y = []
    for line in data.split('\n'):
        if line == '':
            continue
        label, title = line.split('\t', 1)
        y.append(label)
        X.append(title)
    return X, y

with open('50_train.txt') as f_train:
    X_train, y_train = extract_X_y(f_train.read())
with open('50_valid.txt') as f_valid:
    X_valid , y_valid = extract_X_y(f_valid.read())
with open('50_test.txt') as f_test:
    X_test, y_test = extract_X_y(f_test.read())

print('X_train size =', len(X_train), len(X_train[0]))  # 10684 titles, roughly 57 characters each

vec_count = CountVectorizer()
vec_count.fit(X_train)

train_feature = vec_count.transform(X_train).toarray()  # convert to an ordinary (dense) matrix
valid_feature = vec_count.transform(X_valid).toarray()
test_feature  = vec_count.transform(X_test).toarray()

# Convert to numpy arrays
X_train_np = np.array(train_feature)
X_valid_np = np.array(valid_feature)
X_test_np = np.array(test_feature)

y_train_np = np.array(y_train)
y_valid_np = np.array(y_valid)
y_test_np = np.array(y_test)

# Size check
print('X_train_np size =', X_train_np.shape)
print('y_train_np size =', y_train_np.shape)

# Logistic regression, varying the inverse regularization strength C
c_logreg_para_array = [0.2, 0.5, 1, 2, 5]
acc_train = []
acc_test = []
iter_num = []
c_array_for_graph = []
for c_logreg_para in c_logreg_para_array:
    model = LogisticRegression(random_state=None, 
                                max_iter=10000, 
                                C=c_logreg_para,
                                class_weight = 'balanced')
    model.fit(X_train_np, y_train_np)

    # Results on the training data
    pred_train = model.predict(X_train_np)# b e m t
    acc_train.append(accuracy_score(y_train_np, pred_train))


    # Results on the test data
    pred_test = model.predict(X_test_np)
    acc_test.append(accuracy_score(y_test_np, pred_test))
    iter_num.append(model.n_iter_)
    c_array_for_graph.append(c_logreg_para)

    fig = plt.figure()
    ax1 = fig.add_subplot(211)
    ax2 = fig.add_subplot(212)
    ax1.set_xscale('log')
    ax2.set_xscale('log')
    ax1.plot(c_array_for_graph, acc_train, label = 'acc_train')
    ax1.plot(c_array_for_graph, acc_test, label = 'acc_test')
    ax2.plot(c_array_for_graph, iter_num, label = 'iter_num')
    ax1.legend()
    ax2.legend()

    plt.pause(1)
fig.savefig("58_balanced.pdf")

(Figure: train/test accuracy and iteration count vs. C; saved as 58_balanced.pdf.)

I set c_logreg_para_array = [0.2, 0.5, 1, 2, 5] and vary the C parameter of LogisticRegression.
A model is rebuilt and the accuracies recomputed for each value.
The top panel shows the accuracy on train and on test;
the bottom panel shows the number of iterations (not part of the problem statement).
Increasing C increases the number of iterations and the accuracy.
Beyond C=1, however, the test accuracy barely changes (train accuracy hardly rises either, though the number of errors does keep falling).
I raised C to around 20, but could not observe much overfitting.*3
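As a sanity check on what C means (it is the inverse regularization strength, roughly C = 1/λ), one can watch the weight norm shrink as C decreases; a minimal sketch reusing the training arrays above:

for c in [0.2, 1, 5]:
    m = LogisticRegression(max_iter=10000, C=c).fit(X_train_np, y_train_np)
    print(c, np.linalg.norm(m.coef_))  # weight norm; expected to grow with C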

59

Try various things for yourself; I only got as far as the 90-92% range seen above.
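As a starting point, a minimal sketch of a validation-set search over C, reusing the arrays from the earlier problems (extend the candidate grid with other algorithms as the problem intends):

best_c, best_acc = None, 0.0
for c in [0.01, 0.1, 1, 10, 100]:
    m = LogisticRegression(max_iter=10000, C=c).fit(X_train_np, y_train_np)
    acc = accuracy_score(y_valid_np, m.predict(X_valid_np))
    if acc > best_acc:
        best_c, best_acc = c, acc
print('best C on valid:', best_c, best_acc)
# Only after choosing C on valid, evaluate once on the test set.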

*1: The order differs here. When I checked the counts 4489, 1231, 4229, 735, I printed them in the b, t, e, m order of readme.txt, whereas confusion_matrix outputs classes in alphabetical order, b, e, m, t.

*2: The imports are not all at the top only because I wanted them close to where they are used.

*3: Here I took "overfitting" to mean the pattern where train accuracy rises a little while test accuracy drops noticeably.