1.1 输入信息

输入分为三部分:

- train_data.txt 为已经做好特征工程处理的本地训练集文件。每一行为一条数据记录,以逗号分开。最后一列为类别(二分类),前面的列为特征值。

- test_data.txt 为需要预测的本地测试集文件。特征数和训练集一致。不含类别信息。

示例代码为准确率和性能待优化的参考代码,支持的语言分别为C++/Python/JAVA

 

answer.txt 为 test_data.txt 的二分类结果,用于练习的时候使用。

1.2 输出信息

输出信息为一个文件result.txt,按行顺序放置测试集记录的预测结果,每一行代表一条测试数据的二分类结果。

1.3 限制条件

- 选手拿到的训练集和测试集并不是最终判题用的数据。

示例代码的算法实现为LR(逻辑回归),选手可以将其改为其它的机器学习算法,但程序中定义的输入输出文件路径不能改。

- 不允许使用外部机器学习库。

示例代码(LR,逻辑回归)

  1 import math
  2 import datetime
  3 import sys
  4 import numpy as np
  5 
  6 
  7 class LR:
  8     def __init__(self, train_file_name, test_file_name, predict_result_file_name):
  9         self.train_file = train_file_name
 10         self.predict_file = test_file_name
 11         self.predict_result_file = predict_result_file_name
 12         self.max_iters = 760
 13         self.rate = 0.1
 14         self.feats = []
 15         self.labels = []
 16         self.feats_test = []
 17         self.labels_predict = []
 18         self.param_num = 0
 19         self.weight = []
 20 
 21     def loadDataSet(self, file_name, label_existed_flag):
 22         feats = []
 23         labels = []
 24         fr = open(file_name)
 25         lines = fr.readlines()
 26         for line in lines:
 27             temp = []
 28             allInfo = line.strip().split(',')
 29             dims = len(allInfo)
 30             if label_existed_flag == 1:
 31                 for index in range(dims-1):
 32                     temp.append(float(allInfo[index]))
 33                 feats.append(temp)
 34                 labels.append(float(allInfo[dims-1]))
 35             else:
 36                 for index in range(dims):
 37                     temp.append(float(allInfo[index]))
 38                 feats.append(temp)
 39         fr.close()
 40         feats = np.array(feats)
 41         labels = np.array(labels)
 42         return feats, labels
 43 
 44     def loadTrainData(self):
 45         self.feats, self.labels = self.loadDataSet(self.train_file, 1)
 46 
 47     def loadTestData(self):
 48         self.feats_test, self.labels_predict = self.loadDataSet(
 49             self.predict_file, 0)
 50 
 51     def savePredictResult(self):
 52         print(self.labels_predict)
 53         f = open(self.predict_result_file, 'w')
 54         for i in range(len(self.labels_predict)):
 55             f.write(str(self.labels_predict[i])+"\n")
 56         f.close()
 57 
 58     def sigmod(self, x):
 59         return 1/(1+np.exp(-x))
 60 
 61     def printInfo(self):
 62         print(self.train_file)
 63         print(self.predict_file)
 64         print(self.predict_result_file)
 65         print(self.feats)
 66         print(self.labels)
 67         print(self.feats_test)
 68         print(self.labels_predict)
 69 
 70     def initParams(self):
 71         self.weight = np.ones((self.param_num,), dtype=np.float)
 72 
 73     def compute(self, recNum, param_num, feats, w):
 74         return self.sigmod(np.dot(feats, w))
 75 
 76     def error_rate(self, recNum, label, preval):
 77         return np.power(label - preval, 2).sum()
 78 
 79     def predict(self):
 80         self.loadTestData()
 81         preval = self.compute(len(self.feats_test),
 82                               self.param_num, self.feats_test, self.weight)
 83         self.labels_predict = (preval+0.5).astype(np.int)
 84         self.savePredictResult()
 85 
 86     def train(self):
 87         self.loadTrainData()
 88         recNum = len(self.feats)
 89         self.param_num = len(self.feats[0])
 90         #print(self.param_num)
 91         self.initParams()
 92         ISOTIMEFORMAT = '%Y-%m-%d %H:%M:%S,f'
 93         for i in range(self.max_iters):
 94             preval = self.compute(recNum, self.param_num,
 95                                   self.feats, self.weight)
 96             sum_err = self.error_rate(recNum, self.labels, preval)
 97             if i%30 == 0:
 98                 print("Iters:" + str(i) + " error:" + str(sum_err))
 99                 theTime = datetime.datetime.now().strftime(ISOTIMEFORMAT)
100                 print(theTime)
101             err = self.labels - preval
102             delt_w = np.dot(self.feats.T, err)
103             delt_w /= recNum
104             self.weight += self.rate*delt_w
105 
106 
def print_help_and_exit():
    """Print the command-line usage string and terminate the process.

    Raises:
        SystemExit: Always, with exit code ``-1``.
    """
    print("usage:python3 main.py train_data.txt test_data.txt predict.txt [debug]")
    sys.exit(-1)
110 
111 
def parse_args():
    """Return ``True`` when the script was started with the single argument ``debug``.

    Only the case of exactly one command-line argument is inspected: ``debug``
    enables test mode, any other single argument prints the usage text and
    exits. With zero or with several arguments the function returns ``False``.
    """
    if len(sys.argv) != 2:
        return False
    if sys.argv[1] != 'debug':
        print_help_and_exit()
        return False
    print("test mode")
    return True
121 
122 
def _read_labels(path):
    """Return the integer class label found on every non-empty line of *path*."""
    with open(path, 'r') as f:
        return [int(float(line)) for line in f if line.strip()]


if __name__ == "__main__":
    # The input/output paths are fixed by the task statement.
    train_file = "./data/train_data.txt"
    test_file = "./data/test_data.txt"
    predict_file = "./data/result.txt"
    lr = LR(train_file, test_file, predict_file)
    lr.train()
    lr.predict()

    # Local practice mode is always on: compare the predictions against the
    # reference answers. parse_args() is deliberately not consulted here.
    debug = True

    if debug:
        answer_file = "./data/answer.txt"
        a = _read_labels(answer_file)
        p = _read_labels(predict_file)

        print("answer lines:%d" % (len(a)))
        print("predict lines:%d" % (len(p)))

        # Answers that have no matching prediction line count as errors
        # instead of raising IndexError.
        errline = sum(1 for expected, got in zip(a, p) if expected != got)
        errline += max(0, len(a) - len(p))

        # Guard against an empty answer file (would divide by zero).
        accuracy = (len(a) - errline) / len(a) if a else 0.0
        print("accuracy:%f" % (accuracy))