# IPython log file

get_ipython().magic(u'logstart lesson7.log')
# basics: arithmetic, variables, lists, and dicts
4 + 4
5 * 2343
5**2
l = [1, 2, 3, 5]
l
len(l)
l[0]
l[1]
x = 3
x
d = {}
d['key'] = 7
d
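# aside (not in the original session): lookups are by key, and .get
# returns a default instead of raising KeyError when the key is absent
d['key']
d.get('missing', 0)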
# first attempts at a mean absolute error function; pitfalls hit along
# the way, collapsed here into comments:
# - the function body must be indented
# - x - y / len(x) divides y first; parenthesize (x - y)
# - math.abs does not exist; math.fabs (or the builtin abs) is the
#   absolute value
# - calling with scalars fails because len() needs a sequence, and
#   lists don't support '-' element-wise, so we iterate (see below)
import math
def mean_absolute_error(x, y):
    return math.fabs(x - y) / len(x)
# range(1, 10) counts 1..9; the stop value is excluded
for i in range(1, 10):
    print i

# with one argument it starts at 0
for i in range(10):
    print i

# a third argument sets the step
for i in range(1, 10, 2):
    print i

# debug version: just print the paired elements that zip produces
def mean_absolute_error(x, y):
    for (xi, yi) in zip(x, y):
        print xi, yi

mean_absolute_error([5], [3])
# working version: accumulate absolute differences, then average
def mean_absolute_error(x, y):
    total = 0
    for (xi, yi) in zip(x, y):
        total = total + math.fabs(xi - yi)
    return total / len(x)
mean_absolute_error([5], [3])
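# aside (a sketch, assuming numpy is installed): the same computation,
# vectorized
import numpy as np
np.abs(np.array([5]) - np.array([3])).mean()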
def mean_squared_error(x, y):
    total = 0
    for (xi, yi) in zip(x, y):
        total += (xi - yi)**2
    return total / len(x)
# returns 6, not 6.5: in Python 2, int / int truncates
mean_squared_error([0, 0], [2, 3])
# starting the accumulator at 0.0 forces float division
def mean_squared_error(x, y):
    total = 0.0
    for (xi, yi) in zip(x, y):
        total += (xi - yi)**2
    return total / len(x)
mean_squared_error([0, 0], [2, 3])
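# alternative fix (an aside): enable true division for the session, so
# int / int yields a float without any casting
from __future__ import division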
# fails: x is still the int 3 from earlier, and ints aren't iterable
x2 = [i for i in x]
x = [1, 2, 3, 4]
x2 = [i for i in x]
x2
x2 = [i**2 for i in x]
x2
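# aside: comprehensions can also filter; keep only even values, squared
[i**2 for i in x if i % 2 == 0]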
# comprehension version; the same integer-division trap as before
def mean_squared_error2(x, y):
    new_list = [(xi - yi)**2 for (xi, yi) in zip(x, y)]
    total = sum(new_list)
    return total / len(x)
mean_squared_error2([0, 0], [2, 3])
# cast the sum to float to get 6.5
def mean_squared_error2(x, y):
    new_list = [(xi - yi)**2 for (xi, yi) in zip(x, y)]
    total = float(sum(new_list))
    return total / len(x)
mean_squared_error2([0, 0], [2, 3])
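# aside (a sketch): the same computation as a one-liner, using a
# generator expression instead of building the intermediate list
def mean_squared_error3(x, y):
    return sum((xi - yi)**2 for xi, yi in zip(x, y)) / float(len(x))
mean_squared_error3([0, 0], [2, 3])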
# ints: rebinding t leaves u at the old value
t = 5
u = t
t = 7
t
u
# lists: t and u name the same object, so mutating through t shows in u
t = [5]
u = t
t[0] = 7
t
u
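# aside: copy the list when an independent value is wanted
t = [5]
u = list(t)   # a shallow copy; t[:] does the same
t[0] = 7
u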
import sklearn
from sklearn.datasets import load_iris
iris = load_iris()
iris.data
iris.target
from sklearn import neighbors
# the class is KNeighborsClassifier; KNearestNeighbors does not exist
model = neighbors.KNeighborsClassifier()
get_ipython().magic(u'pinfo neighbors.KNeighborsClassifier')
# fit returns the fitted estimator, so reassigning is harmless
model = model.fit(iris.data, iris.target)
print model
# predict takes the features, not the targets
predictions = model.predict(iris.data)
predictions[1:10]
# accuracy on the same data the model was trained on -- optimistic
model.score(iris.data, iris.target)
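# aside (a sketch): with n_neighbors=1 each training point is its own
# nearest neighbor, so the training score comes out (near-)perfect --
# exactly why training accuracy is a poor estimate of real performance
neighbors.KNeighborsClassifier(n_neighbors=1).fit(
    iris.data, iris.target).score(iris.data, iris.target)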
# note: sklearn.cross_validation was renamed sklearn.model_selection in
# later sklearn releases
from sklearn.cross_validation import cross_val_score
get_ipython().magic(u'pinfo cross_val_score')
cross_val_score(model, iris.data, iris.target)
get_ipython().magic(u'pinfo cross_val_score')
cross_val_score(model, iris.data, iris.target, cv=10)
from sklearn import linear_model
model = linear_model.LogisticRegression()
model = model.fit(iris.data, iris.target)
model.score(iris.data, iris.target)
cross_val_score(model, iris.data, iris.target, cv=10)
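# aside: the per-fold scores are easier to compare via their mean
cross_val_score(model, iris.data, iris.target, cv=10).mean()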
from sklearn.cross_validation import train_test_split
get_ipython().magic(u'pinfo train_test_split')
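# train_test_split is looked up but not used above; a minimal sketch of
# its usage (test_size is the held-out fraction):
X_train, X_test, y_train, y_test = train_test_split(
    iris.data, iris.target, test_size=0.25)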
from sklearn.datasets import load_files
categories = [
    'alt.atheism',
    'talk.religion.misc',
    'comp.graphics',
    'sci.space',
]
# paths are relative to the working directory; newer sklearn versions
# call the charset parameter 'encoding'
twenty_train_subset = load_files('20news-bydate-train/', categories=categories, charset='latin-1')
twenty_test_subset = load_files('20news-bydate-test/', categories=categories, charset='latin-1')
# load_files returns a Bunch; use its .data / .target attributes rather
# than integer indexing
twenty_train_subset.data[0]
twenty_train_subset.target[0]
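# aside: targets are integers; target_names maps them back to categories
twenty_train_subset.target_names[twenty_train_subset.target[0]]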
from sklearn.feature_extraction.text import CountVectorizer
get_ipython().magic(u'pinfo CountVectorizer')
v = CountVectorizer()
# fit_transform wants the list of documents, not the whole Bunch
text_features = v.fit_transform(twenty_train_subset.data)
text_features
# the result is a sparse matrix; toarray() makes it dense
text_features.toarray()
# the accessor is get_feature_names(), not feature_names()
v.get_feature_names()
v.get_feature_names()[1:10]
# index the dense array to inspect a single count
text_features.toarray()[4][5]
v = CountVectorizer(max_features=100)
text_features = v.fit_transform(twenty_train_subset.data)
v.get_feature_names()[1:10]
# the keyword is stop_words, not stopwords or stopword
v = CountVectorizer(max_features=100, stop_words='english')
text_features = v.fit_transform(twenty_train_subset.data)
v.get_feature_names()[1:10]
text_features[0]
text_features.toarray()[0]
# the module is sklearn.naive_bayes
from sklearn.naive_bayes import MultinomialNB
model = MultinomialNB()
model.fit(text_features, twenty_train_subset.target)
# cross_val_score clones and refits the model internally, so the fit
# above does not affect the scores
cross_val_score(model, text_features, twenty_train_subset.target)
v = CountVectorizer( stop_words='english')
text_features = v.fit_transform(twenty_train_subset.data)
cross_val_score(model, text_features, twenty_train_subset.target)
cross_val_score(linear_model.LogisticRegression(), text_features, twenty_train_subset.target)
cross_val_score(linear_model.LogisticRegression(), text_features, twenty_train_subset.target, cv=5)
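# aside (a sketch): vectorizing the full dataset before cross-validating
# lets the vocabulary leak across folds; a Pipeline refits the
# vectorizer inside each training fold
from sklearn.pipeline import Pipeline
pipeline = Pipeline([
    ('vect', CountVectorizer(stop_words='english')),
    ('clf', MultinomialNB()),
])
cross_val_score(pipeline, twenty_train_subset.data, twenty_train_subset.target, cv=5)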