newsCorpora.csv contains records such as

1	Fed official says weak data caused by weather, should not slow taper	http://www.latimes.com/business/money/la-fi-mo-federal-reserve-plosser-stimulus-economy-20140310,0,1312750.story\?track=rss	Los Angeles Times	b	ddUyU0VZz0BRneMioxUPQVP6sIxvM	www.latimes.com	1394470370698
2	Fed's Charles Plosser sees high bar for change in pace of tapering	http://www.livemint.com/Politics/H2EvwJSK2VE6OF7iK1g3PP/Feds-Charles-Plosser-sees-high-bar-for-change-in-pace-of-ta.html	Livemint	b	ddUyU0VZz0BRneMioxUPQVP6sIxvM	www.livemint.com	1394470371207
3	US open: Stocks fall after Fed official hints at accelerated tapering	http://www.ifamagazine.com/news/us-open-stocks-fall-after-fed-official-hints-at-accelerated-tapering-294436	IFA Magazine	b	ddUyU0VZz0BRneMioxUPQVP6sIxvM	www.ifamagazine.com	1394470371550

and so on. Fields are separated by \t; field [1] is the title, field [3] the publisher, and field [4] the category label.
50
import random

PATH = 'newsCorpora.csv'

# Keep only articles from the five specified publishers, as [category, title] pairs.
pub5_data = []
with open(PATH) as r:
    data = r.read()
lines = data.splitlines()
for line in lines:
    publisher = line.split('\t')[3]
    if publisher == 'Reuters' or publisher == 'Huffington Post' or publisher == 'Businessweek' or publisher == 'Contactmusic.com' or publisher == 'Daily Mail':
        pub5_data += [[line.split('\t')[4], line.split('\t')[1]]]

# Shuffle, then split 80% / 10% / 10%.
pub5_data_shuffle = random.sample(pub5_data, len(pub5_data))
train = pub5_data_shuffle[:int(0.8*len(pub5_data_shuffle))]
valid = pub5_data_shuffle[int(0.8*len(pub5_data_shuffle)):int(0.9*len(pub5_data_shuffle))]
test = pub5_data_shuffle[int(0.9*len(pub5_data_shuffle)):]

PATH_train = '50_train.txt'
PATH_valid = '50_valid.txt'
PATH_test = '50_test.txt'
with open(PATH_train, 'w') as f_train:
    for line in train:
        f_train.write('\t'.join(line)+'\n')
with open(PATH_valid, 'w') as f_valid:
    for line in valid:
        f_valid.write('\t'.join(line)+'\n')
with open(PATH_test, 'w') as f_test:
    for line in test:
        f_test.write('\t'.join(line)+'\n')
The list is shuffled with random.sample (which returns a new list of the same length in random order), then cut at the 80% and 90% marks to give the train, valid, and test sets.
The output is a file whose lines look like, for example,

e	Woman who MADE UP entire bestselling holocaust memoir is forced to pay back ...
b	UK's FTSE 100 strengthens as Shire surges
e	Lady Gaga covers her entire head in a white furry mask to match her fringed ...
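As a minimal illustration of the splitting logic, here is a toy version on ten items (the seed is an assumption, added only to make the demo reproducible):

import random

random.seed(0)                               # assumed seed, purely for reproducibility
items = list(range(10))
shuffled = random.sample(items, len(items))  # a shuffled copy; `items` itself is untouched
n = len(shuffled)
train = shuffled[:int(0.8 * n)]
valid = shuffled[int(0.8 * n):int(0.9 * n)]
test = shuffled[int(0.9 * n):]
print(len(train), len(valid), len(test))     # 8 1 1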
51
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np

# Extract train/valid/test and split each into features X (titles) and labels y.
def extract_X_y(data):
    X = []
    y = []
    data_train_sep = data.splitlines()
    for line in data_train_sep:
        y += [line.split('\t')[0]]
        X += [line[2:]]  # the title starts after the one-character label and the tab
    return X, y

PATH_train = 'train.txt'
with open(PATH_train) as f_train:
    data = f_train.read()
X_train, y_train = extract_X_y(data)

PATH_valid = 'valid.txt'
with open(PATH_valid) as f_valid:
    data = f_valid.read()
X_valid, y_valid = extract_X_y(data)

PATH_test = 'test.txt'
with open(PATH_test) as f_test:
    data = f_test.read()
X_test, y_test = extract_X_y(data)

# Write out the labels y
PATH_y_train = 'y_train.txt'
with open(PATH_y_train, 'w') as f_y_train:
    f_y_train.write('\n'.join(y_train))
PATH_y_valid = 'y_valid.txt'
with open(PATH_y_valid, 'w') as f_y_valid:
    f_y_valid.write('\n'.join(y_valid))
PATH_y_test = 'y_test.txt'
with open(PATH_y_test, 'w') as f_y_test:
    f_y_test.write('\n'.join(y_test))

vec_count = CountVectorizer()
vec_count.fit(X_train)  # the vocabulary comes from the training set only
X = vec_count.transform(X_train)
Y = vec_count.transform(X_valid)
Z = vec_count.transform(X_test)
'''In sparse-matrix notation, sentence 0 looks like
  (0, 581)    1
  (0, 1549)   1
  (0, 9871)   1
  (0, 9897)   1
  (0, 10088)  1
  (0, 10916)  1
  (0, 12664)  1
'''
# Convert to ordinary (dense) matrices
train_feature = X.toarray()
valid_feature = Y.toarray()
test_feature = Z.toarray()
train_feature = train_feature.tolist()

PATH_train_feature = 'train.feature.txt'
with open(PATH_train_feature, 'w') as w1:
    for line in train_feature:
        w1.write(' '.join([str(char) for char in line]) + '\n')  # '\n' added: one row per line
PATH_valid_feature = 'valid.feature.txt'
with open(PATH_valid_feature, 'w') as w2:
    for line in valid_feature:
        w2.write(' '.join([str(char) for char in line]) + '\n')
PATH_test_feature = 'test.feature.txt'
with open(PATH_test_feature, 'w') as w3:
    for line in test_feature:
        w3.write(' '.join([str(char) for char in line]) + '\n')
One-hot (word-count) vectors were built with CountVectorizer from the vocabulary of the training set. The vectors are large, but this approach was chosen because it is comparatively simple.
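A minimal sketch of what CountVectorizer does here, on a made-up two-title corpus (the sentences are invented for this example, and get_feature_names_out assumes scikit-learn >= 1.0):

from sklearn.feature_extraction.text import CountVectorizer

train_titles = ['Fed official says weak data', 'Stocks fall after Fed hints']  # toy corpus
vec = CountVectorizer()
vec.fit(train_titles)               # the vocabulary comes from the training data only
print(vec.get_feature_names_out())  # alphabetically sorted vocabulary
# Words unseen during fit ('rally' below) are silently dropped at transform time.
print(vec.transform(['Stocks rally after weak data']).toarray())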
52
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import numpy as np

# Split into features X and labels y
def extract_X_y(data):
    X = []
    y = []
    data_train_sep = data.split('\n')
    for line in data_train_sep:
        if line == '':
            continue
        y += [line.split('\t')[0]]
        X += [line[2:]]  # the title starts after the one-character label and the tab
    return X, y

PATH_train = 'train.txt'
with open(PATH_train) as f_train:
    data = f_train.read()
X_train, y_train = extract_X_y(data)

vec_count = CountVectorizer()
vec_count.fit(X_train)
X = vec_count.transform(X_train)
train_feature = X.toarray()  # convert to an ordinary (dense) matrix

# Convert to NumPy arrays
X_train_np = np.array(train_feature)
y_train_np = np.array(y_train)

# Logistic regression
model = LogisticRegression(random_state=None, max_iter=1000)
model.fit(X_train_np, y_train_np)
I used LogisticRegression from sklearn.linear_model. The features and labels are converted to NumPy arrays before fitting (scikit-learn also accepts plain lists, but arrays are the conventional input). A model is created with LogisticRegression(), and calling .fit(features, labels) on it trains the model. This step only fits the regression; it prints nothing.
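After .fit, the learned parameters can be inspected on the model object. A small sketch on a toy binary problem (the arrays are invented for the example):

import numpy as np
from sklearn.linear_model import LogisticRegression

X = np.array([[1, 0, 0], [0, 1, 0], [1, 1, 0], [0, 0, 1], [0, 1, 1], [1, 0, 1]])
y = np.array(['b', 'e', 'b', 'e', 'e', 'b'])
model = LogisticRegression(max_iter=1000).fit(X, y)
print(model.classes_)     # label order used throughout, e.g. by predict_proba
print(model.coef_.shape)  # (1, 3) here: a single weight row in the binary case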
53
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import numpy as np

def extract_X_y(data):
    X = []
    y = []
    data_train_sep = data.split('\n')
    for line in data_train_sep:
        if line == '':
            continue
        y += [line.split('\t')[0]]
        X += [line[2:]]
    return X, y

# Extract train/valid/test and split each into features X and labels y
with open('train.txt') as f_train:
    X_train, y_train = extract_X_y(f_train.read())
with open('valid.txt') as f_valid:
    X_valid, y_valid = extract_X_y(f_valid.read())
with open('test.txt') as f_test:
    X_test, y_test = extract_X_y(f_test.read())

vec_count = CountVectorizer()
vec_count.fit(X_train)
train_feature = vec_count.transform(X_train).toarray()  # convert to ordinary (dense) matrices
valid_feature = vec_count.transform(X_valid).toarray()
test_feature = vec_count.transform(X_test).toarray()

# Convert to NumPy arrays
X_train_np = np.array(train_feature)
y_train_np = np.array(y_train)

# Logistic regression
model = LogisticRegression(random_state=None, max_iter=10000)
model.fit(X_train_np, y_train_np)

pred_train = model.predict(X_train_np)
pred_train_proba = model.predict_proba(X_train_np)  # columns: b e m t
print('pred_train: ', pred_train)
print('pred_train_proba: ', pred_train_proba)
The output is, for example,
pred_train:  ['b' 't' 'b' ... 'm' 'e' 'b']
pred_train_proba:  [[8.89689407e-01 1.85913832e-02 2.63040290e-02 6.54151808e-02]
 [1.34468354e-01 9.68217362e-02 5.51499875e-02 7.13559922e-01]
 [9.99953355e-01 6.10473669e-06 2.17606365e-05 1.87801207e-05]
 ...
 [1.29665708e-02 2.69734697e-02 7.49512905e-01 2.10547055e-01]
 [4.02224038e-02 9.20044044e-01 1.79165884e-02 2.18169643e-02]
 [8.70917324e-01 7.66668727e-02 9.93797901e-03 4.24778239e-02]]
Calling .predict(features) on the trained model returns the classification results as a numpy array. Likewise, .predict_proba(features) returns the predicted probability of each class, with one column per label in the order given by model.classes_.
Setting

np.set_printoptions(suppress=True, precision=3)

before printing gives

pred_train:  ['b' 't' 'b' ... 'm' 'e' 'b']
pred_train_proba:  [[0.89  0.019 0.026 0.065]
 [0.134 0.097 0.055 0.714]
 [1.    0.    0.    0.   ]
 ...
 [0.013 0.027 0.75  0.211]
 [0.04  0.92  0.018 0.022]
 [0.871 0.077 0.01  0.042]]

which is much easier to read.
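Since the probability columns follow model.classes_ (alphabetical here: b, e, m, t), they can be paired up explicitly. A short sketch reusing model and X_train_np from the listing above:

# Assumes `model` and `X_train_np` from the listing above.
for label, prob in zip(model.classes_, model.predict_proba(X_train_np)[0]):
    print(label, f'{prob:.3f}')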
54
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import numpy as np

np.set_printoptions(suppress=True, precision=3, formatter={'float': '{:.2f}'.format})

def extract_X_y(data):
    X = []
    y = []
    data_train_sep = data.split('\n')
    for line in data_train_sep:
        if line == '':
            continue
        y += [line.split('\t')[0]]
        X += [line[2:]]
    return X, y

# Extract train/valid/test and split each into features X and labels y
with open('train.txt') as f_train:
    X_train, y_train = extract_X_y(f_train.read())
with open('valid.txt') as f_valid:
    X_valid, y_valid = extract_X_y(f_valid.read())
with open('test.txt') as f_test:
    X_test, y_test = extract_X_y(f_test.read())

vec_count = CountVectorizer()
vec_count.fit(X_train)
train_feature = vec_count.transform(X_train).toarray()  # convert to ordinary (dense) matrices
valid_feature = vec_count.transform(X_valid).toarray()
test_feature = vec_count.transform(X_test).toarray()

# Convert to NumPy arrays
X_train_np = np.array(train_feature)
X_valid_np = np.array(valid_feature)
X_test_np = np.array(test_feature)
y_train_np = np.array(y_train)
y_valid_np = np.array(y_valid)
y_test_np = np.array(y_test)

# Logistic regression
model = LogisticRegression(random_state=None, max_iter=10000)
model.fit(X_train_np, y_train_np)

pred_train = model.predict(X_train_np)  # labels: b e m t
acc_train = accuracy_score(y_train_np, pred_train)
print('acc_train: ', acc_train)
pred_valid = model.predict(X_valid_np)
acc_valid = accuracy_score(y_valid_np, pred_valid)
print('acc_valid: ', acc_valid)
pred_test = model.predict(X_test_np)
acc_test = accuracy_score(y_test_np, pred_test)
print('acc_test: ', acc_test)
The output is, for example,
acc_train:  0.9960688880569075
acc_valid:  0.905688622754491
acc_test:  0.9169161676646707
Importing accuracy_score from sklearn.metrics and calling accuracy_score(true_labels, predictions) computes the accuracy. One-hot features alone already reach roughly 90% accuracy.
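A tiny sanity check of accuracy_score on invented labels:

from sklearn.metrics import accuracy_score

# 3 of the 4 toy predictions match the true labels.
print(accuracy_score(['b', 'e', 'm', 't'], ['b', 'e', 'e', 't']))  # 0.75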
55
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
import numpy as np

def extract_X_y(data):
    X = []
    y = []
    data_train_sep = data.split('\n')
    for line in data_train_sep:
        if line == '':
            continue
        y += [line.split('\t')[0]]
        X += [line[2:]]
    return X, y

# Extract train/valid/test and split each into features X and labels y
with open('train.txt') as f_train:
    X_train, y_train = extract_X_y(f_train.read())
with open('valid.txt') as f_valid:
    X_valid, y_valid = extract_X_y(f_valid.read())
with open('test.txt') as f_test:
    X_test, y_test = extract_X_y(f_test.read())
print('X_train size =', len(X_train), len(X_train[0]))  # 10684 titles of roughly 57 characters

vec_count = CountVectorizer()
vec_count.fit(X_train)
train_feature = vec_count.transform(X_train).toarray()  # convert to ordinary (dense) matrices
valid_feature = vec_count.transform(X_valid).toarray()
test_feature = vec_count.transform(X_test).toarray()

# Convert to NumPy arrays
X_train_np = np.array(train_feature)
X_valid_np = np.array(valid_feature)
X_test_np = np.array(test_feature)
y_train_np = np.array(y_train)
y_valid_np = np.array(y_valid)
y_test_np = np.array(y_test)

# Logistic regression
model = LogisticRegression(random_state=None, max_iter=10000)
model.fit(X_train_np, y_train_np)

# Results on the training data
pred_train = model.predict(X_train_np)
Conf_Mat_train = confusion_matrix(y_train_np, pred_train)
print('Conf_Mat_train:\n', Conf_Mat_train)
# Results on the test data
pred_test = model.predict(X_test_np)
Conf_Mat_test = confusion_matrix(y_test_np, pred_test)
print('Conf_Mat_test:\n', Conf_Mat_test)
The output is, for example,
Conf_Mat_train:
 [[4477    2    0   10]
 [   8 4218    1    2]
 [   2    2  731    0]
 [  12    2    0 1217]]
Conf_Mat_test:
 [[532  20   4  14]
 [  7 530   0   2]
 [ 11  11  45   7]
 [ 31  14   3 105]]
The counts of correct predictions lie on the diagonal. For example, the 10 in Conf_Mat_train is the number of items whose true category is 0 that were (mis)classified as category 3. This can also be checked against the split in problem 50, which produced 4489, 1231, 4229, and 735 items for the respective categories.*1
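For reference, sklearn's convention on a toy example: rows are true labels and columns are predictions, with the labels in sorted order:

from sklearn.metrics import confusion_matrix

y_true = ['b', 'b', 'e', 'e']
y_pred = ['b', 'e', 'e', 'e']
print(confusion_matrix(y_true, y_pred))
# [[1 1]     one 'b' correct, one 'b' misclassified as 'e'
#  [0 2]]    both 'e' correct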
56
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
import numpy as np

def extract_X_y(data):
    X = []
    y = []
    data_train_sep = data.split('\n')
    for line in data_train_sep:
        if line == '':
            continue
        y += [line.split('\t')[0]]
        X += [line[2:]]
    return X, y

# Extract train/valid/test and split each into features X and labels y
with open('50_train.txt') as f_train:
    X_train, y_train = extract_X_y(f_train.read())
with open('50_valid.txt') as f_valid:
    X_valid, y_valid = extract_X_y(f_valid.read())
with open('50_test.txt') as f_test:
    X_test, y_test = extract_X_y(f_test.read())
print('X_train size =', len(X_train), len(X_train[0]))  # 10684 titles of roughly 57 characters

vec_count = CountVectorizer()
vec_count.fit(X_train)
train_feature = vec_count.transform(X_train).toarray()  # convert to ordinary (dense) matrices
valid_feature = vec_count.transform(X_valid).toarray()
test_feature = vec_count.transform(X_test).toarray()

# Convert to NumPy arrays
X_train_np = np.array(train_feature)
X_valid_np = np.array(valid_feature)
X_test_np = np.array(test_feature)
y_train_np = np.array(y_train)
y_valid_np = np.array(y_valid)
y_test_np = np.array(y_test)

# Size check
print('X_train_np size =', X_train_np.shape)
print('y_train_np size =', y_train_np.shape)

# Logistic regression
model = LogisticRegression(random_state=None, max_iter=10000)
model.fit(X_train_np, y_train_np)

# Results on the training data
pred_train = model.predict(X_train_np)  # labels: b e m t
acc_train = accuracy_score(y_train_np, pred_train)
# Results on the test data
pred_test = model.predict(X_test_np)
acc_test = accuracy_score(y_test_np, pred_test)
conf_mat_test = confusion_matrix(y_test_np, pred_test)
# conf_mat_test[m][n] is the number of samples whose true label is m and whose
# predicted label is n: rows are the true labels, columns the predictions.
print('Conf_Mat_test:\n', conf_mat_test)

# Per-category accuracy: (TP + TN) / total
acc_test_by_category = [np.sum([conf_mat_test[i][j] for i in range(4) for j in range(4)
                                if i == k and j == k or i != k and j != k]) / np.sum(conf_mat_test)
                        for k in range(4)]
# Precision: diagonal / column sum; recall: diagonal / row sum
prec_test_by_category = [conf_mat_test[i][i]/np.sum(conf_mat_test, axis=0)[i] for i in range(4)]
reca_test_by_category = [conf_mat_test[i][i]/np.sum(conf_mat_test, axis=1)[i] for i in range(4)]
# Rename once to keep the F1 formula short
p = prec_test_by_category.copy()
r = reca_test_by_category.copy()
f1_test_by_category = [2*p[i]*r[i] / (p[i]+r[i]) for i in range(4)]
print(prec_test_by_category)
print(reca_test_by_category)
print(f1_test_by_category)

acc_macro = np.mean(acc_test_by_category)
prec_macro = np.mean(prec_test_by_category)
reca_macro = np.mean(reca_test_by_category)
f1_macro = np.mean(f1_test_by_category)
print('acc_macro: ', acc_macro)
print('prec_macro: ', prec_macro)
print('reca_macro: ', reca_macro)
print('f1_macro: ', f1_macro)

acc_micro = np.mean(np.sum([conf_mat_test[i][i] for i in range(4)])/np.sum(conf_mat_test))
print('acc_micro = prec_micro = reca_micro: ', acc_micro)
print('---')

from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
print(precision_score(y_test_np, pred_test, average=None))
print(recall_score(y_test_np, pred_test, average=None))
print(f1_score(y_test_np, pred_test, average=None))
print(precision_score(y_test_np, pred_test, average='micro'))
print(recall_score(y_test_np, pred_test, average='micro'))
print(f1_score(y_test_np, pred_test, average='micro'))
print(precision_score(y_test_np, pred_test, average='macro'))
print(recall_score(y_test_np, pred_test, average='macro'))
print(f1_score(y_test_np, pred_test, average='macro'))
The output is, for example,
[0.9156626506024096, 0.9217391304347826, 0.8653846153846154, 0.8203125]
[0.9333333333333333, 0.9833024118738405, 0.6081081081081081, 0.6862745098039216]
[0.9244135534317984, 0.9515260323159784, 0.7142857142857144, 0.7473309608540926]
acc_macro:  0.9535928143712575
prec_macro:  0.8807747241054519
reca_macro:  0.8027545907798008
f1_macro:  0.8343890652218959
acc_micro = prec_micro = reca_micro:  0.907185628742515
---
[0.91566265 0.92173913 0.86538462 0.8203125 ]
[0.93333333 0.98330241 0.60810811 0.68627451]
[0.92441355 0.95152603 0.71428571 0.74733096]
0.907185628742515
0.907185628742515
0.907185628742515
0.8807747241054519
0.8027545907798008
0.8343890652218959
acc_test_by_category is written as a list comprehension. I renamed the variables once to shorten the F1 formula, but that does not seem like good style, so I will not use it often. Seeing acc_micro == prec_micro == reca_micro made me doubt my understanding, but it is correct: in single-label multiclass classification, every misclassification is simultaneously a false positive for the predicted class and a false negative for the true class, so micro-averaged precision and recall both reduce to (correct predictions)/(all predictions), which is the accuracy. Since precision_score, recall_score, and f1_score from sklearn.metrics already provide all of this, there was no need to write the list comprehensions. Using the ready-made functions seems the safer choice.*2
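A short check, on invented labels, that the micro averages collapse to accuracy in single-label multiclass classification:

from sklearn.metrics import accuracy_score, precision_score, recall_score

y_true = ['b', 'e', 'm', 't', 'b', 'e']
y_pred = ['b', 'e', 'e', 't', 'e', 'e']
# Every error is a false positive for the predicted class and a false negative
# for the true class, so the micro-averaged totals equal plain accuracy.
print(accuracy_score(y_true, y_pred))                    # 0.666...
print(precision_score(y_true, y_pred, average='micro'))  # same value
print(recall_score(y_true, y_pred, average='micro'))     # same value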
57
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import numpy as np

def extract_X_y(data):
    X = []
    y = []
    data_train_sep = data.split('\n')
    for line in data_train_sep:
        if line == '':
            continue
        y += [line.split('\t')[0]]
        X += [line[2:]]
    return X, y

# Extract train/valid/test and split each into features X and labels y
with open('50_train.txt') as f_train:
    X_train, y_train = extract_X_y(f_train.read())
with open('50_valid.txt') as f_valid:
    X_valid, y_valid = extract_X_y(f_valid.read())
with open('50_test.txt') as f_test:
    X_test, y_test = extract_X_y(f_test.read())

vec_count = CountVectorizer()
vec_count.fit(X_train)
train_feature = vec_count.transform(X_train).toarray()  # convert to ordinary (dense) matrices
valid_feature = vec_count.transform(X_valid).toarray()
test_feature = vec_count.transform(X_test).toarray()

# Convert to NumPy arrays
X_train_np = np.array(train_feature)
X_valid_np = np.array(valid_feature)
X_test_np = np.array(test_feature)
y_train_np = np.array(y_train)
y_valid_np = np.array(y_valid)
y_test_np = np.array(y_test)

# Logistic regression
model = LogisticRegression(random_state=None, max_iter=10000)
model.fit(X_train_np, y_train_np)

# Results on the training data
pred_train = model.predict(X_train_np)  # labels: b e m t
acc_train = accuracy_score(y_train_np, pred_train)
# Results on the test data
pred_test = model.predict(X_test_np)
acc_test = accuracy_score(y_test_np, pred_test)

for i, c_name in enumerate(['business', 'entertainment', 'medical', 'technology']):
    print('---typical word in ' + c_name + ' category---')
    index_list = np.argsort(model.coef_[i])
    sorted_coef_ = sorted(model.coef_[i])
    # Print the 10 largest weights in descending order, then the 10 smallest.
    # Note: get_feature_names() was removed in newer scikit-learn;
    # use get_feature_names_out() there.
    for j in list(range(-1, -11, -1)) + list(range(9, -1, -1)):
        print(vec_count.get_feature_names()[index_list[j]], f': {sorted_coef_[j]:.4g}')
The output is, for example,
---typical word in business category---
bank : 1.953
ecb : 1.713
fed : 1.707
yellen : 1.596
china : 1.583
dollar : 1.541
obamacare : 1.523
ukraine : 1.512
mcdonald : 1.457
euro : 1.45
star : -0.9617
cap : -0.9827
heartbleed : -0.9846
subscription : -1.026
nintendo : -1.058
she : -1.075
twitch : -1.126
ebola : -1.149
activision : -1.392
aereo : -1.505
---typical word in entertainment category---
kardashian : 1.573
chris : 1.548
thrones : 1.466
paul : 1.402
transformers : 1.393
miley : 1.383
movie : 1.378
cyrus : 1.315
film : 1.303
music : 1.271
climate : -1.037
study : -1.063
ebola : -1.1
data : -1.112
billion : -1.148
gm : -1.234
china : -1.252
facebook : -1.277
microsoft : -1.279
google : -1.531
---typical word in medical category---
ebola : 2.76
fda : 2.239
cancer : 2.108
mers : 1.904
study : 1.746
medical : 1.666
cases : 1.645
doctors : 1.576
drug : 1.546
cigarettes : 1.545
cars : -0.5223
mobile : -0.5231
netflix : -0.5927
bank : -0.6168
climate : -0.6489
twitter : -0.732
apple : -0.748
facebook : -0.75
dimon : -0.7779
gm : -0.8259
---typical word in technology category---
facebook : 2.757
google : 2.563
microsoft : 2.501
apple : 2.489
climate : 2.362
heartbleed : 1.906
fcc : 1.707
gm : 1.691
tesla : 1.655
nasa : 1.621
accused : -0.7154
fed : -0.7429
move : -0.7849
american : -0.7886
health : -0.7966
grows : -0.8256
concerns : -0.8273
valued : -0.8418
percent : -0.8496
stocks : -1.191
A word like bank raises the probability that an article belongs to category b, while aereo lowers it; the absolute value of the coefficient is the strength of that effect. It felt odd that Aereo, a technology company, counts against b rather than simply not for it, but every article containing Aereo (capitalized in the actual data) belongs to category e or t.
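An equivalent, slightly shorter way to list the strongest features per class, reusing model and vec_count from the listing above (get_feature_names_out assumes scikit-learn >= 1.0):

import numpy as np

# Assumes `model` and `vec_count` from the listing above.
feature_names = np.array(vec_count.get_feature_names_out())
for label, coefs in zip(model.classes_, model.coef_):
    top = np.argsort(coefs)[::-1][:5]  # indices of the 5 largest weights
    print(label, list(feature_names[top]))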
58
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
import numpy as np
import matplotlib.pyplot as plt

def extract_X_y(data):
    X = []
    y = []
    data_train_sep = data.split('\n')
    for line in data_train_sep:
        if line == '':
            continue
        X += [line[2:]]
        y += [line.split('\t')[0]]
    return X, y

# Extract train/valid/test and split each into features X and labels y
with open('50_train.txt') as f_train:
    X_train, y_train = extract_X_y(f_train.read())
with open('50_valid.txt') as f_valid:
    X_valid, y_valid = extract_X_y(f_valid.read())
with open('50_test.txt') as f_test:
    X_test, y_test = extract_X_y(f_test.read())
print('X_train size =', len(X_train), len(X_train[0]))  # 10684 titles of roughly 57 characters

vec_count = CountVectorizer()
vec_count.fit(X_train)
train_feature = vec_count.transform(X_train).toarray()  # convert to ordinary (dense) matrices
valid_feature = vec_count.transform(X_valid).toarray()
test_feature = vec_count.transform(X_test).toarray()

# Convert to NumPy arrays
X_train_np = np.array(train_feature)
X_valid_np = np.array(valid_feature)
X_test_np = np.array(test_feature)
y_train_np = np.array(y_train)
y_valid_np = np.array(y_valid)
y_test_np = np.array(y_test)

# Size check
print('X_train_np size =', X_train_np.shape)
print('y_train_np size =', y_train_np.shape)

# Logistic regression with several regularization strengths C
c_logreg_para_array = [0.2, 0.5, 1, 2, 5]
acc_train = []
acc_test = []
iter_num = []
c_array_for_graph = []
for c_count, c_logreg_para in enumerate(c_logreg_para_array):
    model = LogisticRegression(random_state=None, max_iter=10000,
                               C=c_logreg_para, class_weight='balanced')
    model.fit(X_train_np, y_train_np)
    # accuracy on the training data
    pred_train = model.predict(X_train_np)  # labels: b e m t
    acc_train.append(accuracy_score(y_train_np, pred_train))
    # accuracy on the test data
    pred_test = model.predict(X_test_np)
    acc_test.append(accuracy_score(y_test_np, pred_test))
    iter_num.append(model.n_iter_)
    c_array_for_graph.append(c_logreg_para)

fig = plt.figure()
ax1 = fig.add_subplot(211)
ax2 = fig.add_subplot(212)
ax1.set_xscale('log')
ax2.set_xscale('log')
ax1.plot(c_array_for_graph, acc_train, label='acc_train')
ax1.plot(c_array_for_graph, acc_test, label='acc_test')
ax2.plot(c_array_for_graph, iter_num, label='iter_num')
ax1.legend()
ax2.legend()
plt.pause(1)
fig.savefig("58_balanced.pdf")
The output is the two-panel figure saved as 58_balanced.pdf (upper panel: train/test accuracy vs. C; lower panel: iteration counts; figure omitted here).
Setting c_logreg_para_array = [0.2, 0.5, 1, 2, 5], the value of C passed to LogisticRegression is varied; for each value a model is rebuilt and its accuracy computed. The upper panel shows the accuracy on train and on test; the lower panel shows the number of iterations (not part of the problem statement). Increasing C (i.e. weakening the regularization) increases the number of iterations and raises the accuracy, but above C=1 the test accuracy barely changes (train accuracy also barely rises, although the number of errors keeps shrinking). I raised C to around 20, but still could not observe much overfitting*3.
59
Try various settings yourself; I only went as far as the 90-92% accuracy shown above.
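A minimal sketch of the kind of search problem 59 asks for: pick the model with the best validation accuracy, then report its test accuracy. It reuses the *_np arrays from the earlier listings; the candidate grid is my own choice:

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

best_acc, best_model = 0.0, None
for C in [0.1, 0.5, 1, 5, 10]:  # hypothetical candidate grid
    m = LogisticRegression(max_iter=10000, C=C)
    m.fit(X_train_np, y_train_np)
    acc = accuracy_score(y_valid_np, m.predict(X_valid_np))
    if acc > best_acc:
        best_acc, best_model = acc, m
print('best valid acc:', best_acc)
print('test acc:', accuracy_score(y_test_np, best_model.predict(X_test_np)))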