some Python (Ng)

------------------------------------------------------------- Ng ----------------------------------------------

# matrix basics
import numpy as np  # imported here so this snippet runs on its own

a = np.matrix([[1, 2, 3],
               [2, 3, 4]])
print(a)
print()

# `and` / `or` are logical operators that return one of their operands;
# `|` / `&` are bitwise operators and are only defined for integers here.
print('bhai' and 'behen')  # both operands truthy -> prints the second one
print('bhai' or 'behen')  # first operand truthy -> prints the first one
# print('bhai' | 'behen')  # TypeError: `|` is not defined for str
print(1010 | 101)  # bitwise OR of the decimal ints 1010 and 101 (they only look binary)
print(1010 and 123)  # both operands truthy -> prints the second one
print()
# formatting
print('the value of pi is : {:0.2f}'.format(np.pi))
print()
# b = np.atleast_2d(a) --- try this
b = np.matrix([1, 2, 3])
print(b)
print(b.shape)
c = b.T
print(c)
print(c.shape)
# matrix vs array: np.matrix is a subclass of ndarray that is always 2-D
# a*b on matrices is equal to np.dot(a, b) on arrays
print()
d = np.matrix('1; 2; 3')
print(d)
print(d.dtype)
# also read: .mat, asmatrix, asarray
print(np.ones((2, 3)))
print()
print(np.random.randint(0, 10, (3, 4)))
print()
print(np.random.random((3, 4)))  # random floats in the half-open interval [0.0, 1.0)
print()
print(np.random.randn(5))  # samples from the "standard normal" distribution

# The original called `mt.sqrt`, but no module is ever bound to `mt`
# (NameError); np.sqrt gives the same scale factor without a new import.
w = -6 + np.sqrt(10) * np.random.randn(10000)

#plot = plt.hist(w,50)
#plt.show(plot)

print()
print(np.eye(3))

#print(help(np.random))

########## moving data around ----------------


import os  # local imports so this snippet runs on its own

import numpy as np

a = np.array([[1, 2, 3],
              [2, 3, 4]])
print(a)
print()

b = np.matrix('1, 2, 3 ; 2, 3, 4')
print(b)

# indexing one row of a matrix keeps it 2-D (shape (1, 3)), so len() is 1
print(len(b[1]))

c = [1, 2, 3]
d = [2, 3, 4]
print()
e = np.array([c, d])
print(e)
print()
f = np.matrix([c, d])
print(f)
print('---')
a = np.array([1, 2, 3])
print(b.size)
print()
# equivalent to the `ls` command
# files only: ls = [f for f in os.listdir('.') if os.path.isfile(f)]
# os.listdir already returns a list; the original wrapped it in a second
# list, which produced a one-element nested list instead of the entries.
ls = os.listdir(os.curdir)
# or: ls = os.listdir('.')
print(ls)
# equivalent to `pwd`: the present working directory
print(os.getcwd())

#---------------------

# Features: integer values read from a comma-delimited text file, wrapped
# in np.matrix so the result is always 2-D.
featuresX = np.matrix(np.loadtxt('featuresX.dat', dtype=int, delimiter=','))
print(featuresX)
# Prices: integer values, turned into a column by transposing the matrix.
priceY = np.matrix(np.loadtxt('priceY.dat', dtype=int)).T
print('', priceY, '', sep='\n')
# single element access would be: featuresX[1, 0]
print(featuresX.shape, priceY.shape, '', sep='\n')
# To keep the first ten prices and write them to a file:
#   v = priceY[:10]
#   np.savetxt('v.dat', v, fmt='%.0f')

# Rows with index 1 and 3, i.e. the 2nd and 4th rows (indexing is 0-based).
print(featuresX[[1, 3], :])


a = np.matrix([[1, 2], [2, 3], [3, 4]])
print(a, end='\n\n')
# overwrite the second column with a 3x1 column matrix
a[:, 1] = np.matrix([[10], [11], [12]])
print(a, end='\n\n')
# add one more column on the right
a = np.concatenate((a, np.matrix([[100], [231], [112]])), axis=1)
print(a)
# flatten() on a matrix still returns a 2-D result with a single row
print(a.flatten())

c = np.matrix([[1, 2], [3, 4]])
d = np.matrix([[11, 12], [13, 14]])
# place the two matrices side by side
e = np.concatenate((c, d), axis=1)
print()
print(e)


#---------- basic linear regression ... house data

import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt
import csv
import math
from sklearn import linear_model
#from sklearn.cross_validation import train_test_split

# df = pd.read_csv(r'C:\Users\avishek\Downloads\Datasets\adult.data.csv', header=None)
# #print(df.head())
# df.columns =['age','workclass','fnlwgt','education','education-num',
#                                'marital-status','occupation','relationship','race','sex',
#                                 'capital-gain','capital-loss','hours-per-week','native-country','last-column']
# #print(df[df['workclass'] == 'State-gov'].head())
# #print(df.head())
#
# #print(df['workclass']==' State-gov')
# print(df['sex'].unique())

# Features are read as comma-delimited integers; the predict calls below
# pass two values per sample, so two feature columns are presumably expected.
featuresX = np.loadtxt('featuresX.dat', dtype=int, delimiter=',')
# Keep the target a plain (n, 1) ndarray: current scikit-learn releases
# reject np.matrix inputs, so reshape is used instead of np.matrix(...).T.
priceY = np.loadtxt('priceY.dat', dtype=int).reshape(-1, 1)

regr = linear_model.LinearRegression()
regr.fit(featuresX, priceY)
# plt.scatter(featuresX[:,0], priceY,  color='black')
# plt.show()
# plt.scatter(featuresX[:,1], priceY,  color='black')
# plt.show()
# predict() expects a 2-D input of shape (n_samples, n_features); a single
# sample therefore has to be wrapped in an outer list.
print(regr.predict([[2104, 3]]))
print()
print(regr.predict([[1416, 2]]))

#--- same code using excel

# `sheetname` was renamed to `sheet_name` in pandas and the old spelling was
# later removed, so the original keyword raises TypeError on current pandas.
df = pd.read_excel(r'C:\Users\avishek\Downloads\housing.xlsx', sheet_name='Sheet1', header=None)
#print(df)
# Columns 0 and 1 are the features, column 2 the target. Plain ndarrays are
# used because current scikit-learn releases reject np.matrix inputs.
F_X = df[[0, 1]].values
#print(F_X)
P_Y = df[[2]].values  # double brackets keep the target as an (n, 1) column

regr = linear_model.LinearRegression()
regr.fit(X=F_X, y=P_Y)

# predict() needs a 2-D (n_samples, n_features) input, hence the outer list
print(regr.predict([[1203, 3]]))

# note: accuracy_score cannot be used with a regression problem (it is a classification metric)

#----------- basic gradient descent ------------

# the code below lives in a separate .py file named graddesc
import numpy as np
def gradDesc(X, y, theta):
    """Return the halved mean squared error of the linear model ``X @ theta``.

    Despite its name, this evaluates the cost only; it performs no
    gradient-descent update.

    X     -- feature matrix, one training example per row
    y     -- target values, shaped to subtract from ``np.dot(X, theta)``
    theta -- model parameters

    The squared errors are summed down axis 0, so one cost value is
    returned per column of the prediction.
    """
    n_examples = X.shape[0]
    residuals = np.dot(X, theta) - y
    # np.square (not ** 2) keeps the operation element-wise for np.matrix too
    column_totals = np.sum(np.square(residuals), axis=0)
    return 0.5 / n_examples * column_totals


import graddesc  # the separate module that holds gradDesc

# features as a 2-D matrix, prices as a column matrix
featuresX = np.matrix(np.loadtxt('featuresX.dat', dtype=int, delimiter=','))
priceY = np.matrix(np.loadtxt('priceY.dat', dtype=int)).T
# fixed parameter column at which the cost is evaluated
theta = np.matrix([[1.30728554], [-113.50022949]])
costFunc = graddesc.gradDesc(featuresX, priceY, theta)
print(costFunc)



#-------- reading multiple files and concatenating

#multiple files concatenating into one
# Read every entry of the dataset directory as a headerless CSV and stack
# the resulting frames row-wise.
data_dir = r'C:\Users\avishek\Downloads\Datasets'
filelist = os.listdir(data_dir)
#print(filelist)
# os.path.join supplies the separator; the original appended the file name
# to a raw string ending in two backslashes, giving a doubled separator.
df_list = [pd.read_csv(os.path.join(data_dir, file), header=None) for file in filelist]


df = pd.concat(df_list, axis=0)
#print(df_list[0].head())
print(df.head())
print(df.shape)

#------- same thing as above using glob

from glob import glob  # the original called glob() without importing it

path = r'C:\Users\avishek\Downloads\Datasets'
# '*.*.csv' only matches names containing two dots, e.g. adult.data.csv.
# glob() returns matches in arbitrary order, so sort for a stable result.
filelist = sorted(glob(path + '/*.*.csv'))
print(filelist)

df_list = [pd.read_csv(f, header=None) for f in filelist]





#--computing in data -----------


#computing on data

a = np.matrix([[1, 2], [3, 4], [5, 6]])
b = np.matrix([[11, 12], [13, 14], [15, 16]])
c = np.matrix([[1, 1], [2, 2]])

# matrix product: (3x2) times (2x2)
d = a.dot(c)
print(d, end='\n\n')
# element-wise product (a * b on matrices would be a matrix product instead)
e = np.multiply(a, b)
print(e, end='\n\n')
print(np.square(a), end='\n\n')

# cast to float before taking the element-wise reciprocal
print(np.reciprocal(a.astype(float)), end='\n\n')
print(np.log(a), end='\n\n')
m = np.matrix([[-1, 2], [-3, 4], [5, -6]])
print(np.abs(m), end='\n\n')
print(-m, end='\n\n')
# adding a scalar is applied to every element
print(m + 1, end='\n\n')
# column-wise maximum
print(m.max(axis=0), end='\n\n')
# position of the largest element, counted in the flattened matrix
print(m.argmax(), end='\n\n')
# boolean mask selects the positive entries
print(m[m > 0], end='\n\n')
# row sums, running row sums, and the product of all elements
print(m.sum(axis=1), end='\n\n')
print(m.cumsum(axis=1), end='\n\n')
print(m.prod(), end='\n\n')

#-------- plotting

import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mp

# Sample a sine with 4 cycles per unit of t on the grid 0, 0.01, ... (< 0.98).
t = np.arange(0,0.98,0.01)
#print(t)
y = np.sin(2*np.pi*4*t)

plt.plot(t,y,linewidth=2.0)
#plt.plot(y)

#plt.plot([1,2,3,4])
#plt.ylabel('some numbers')
# When plot() is given a single vector it is treated as the y values and x is
# generated automatically: for 4 y values, x = [0, 1, 2, 3] (indices start at 0).

# NOTE: x and y are rebound here; the sine drawn above stays on the axes.
x=np.array([1,2,3,4])
y = np.square(x)
# plot() returns a list of line objects; the trailing comma unpacks the only one
line, = plt.plot(x, y, '-')
print(line)
# properties can then be changed through that object
plt.setp(line, color='r')
# setp = set property
# MATLAB-style equivalent of the call above: plt.setp(line, 'color', 'r')
line.set_antialiased(False)
print(y)

plt.plot(x,y)
plt.plot(x,y,'ro')
# 'ro' means red circle markers

# evenly sampled values from 0 up to (not including) 5 in steps of 0.2
t = np.arange(0., 5., 0.2)
# several curves in one plot() call, each (x, y, format) triple styled separately:
# red dashes, blue squares and green triangles
plt.plot(t, t, 'r--', t, t**2, 'bs', t, t**3, 'g^')
plt.show()


def f(t):
    """Return exp(-t) * cos(2*pi*t), evaluated element-wise on ``t``."""
    decay = np.exp(-t)
    oscillation = np.cos(2*np.pi*t)
    return decay * oscillation

# Two sampling grids over [0, 5): a coarse one for markers, a fine one for lines.
t1 = np.arange(0.0, 5.0, 0.1)
t2 = np.arange(0.0, 5.0, 0.02)

plt.figure(1)
plt.subplot(211)
# blue circles at the coarse samples, black line through the fine samples
plt.plot(t1, f(t1), 'bo', t2, f(t2), 'k')

plt.subplot(212)
plt.plot(t2, np.cos(2*np.pi*t2), 'r--')
plt.show()

# subplot(211) is equivalent to subplot(2, 1, 1):
# 2 = number of rows, 1 = number of columns, 1 = index of the subplot

plt.figure(1)                # the first figure
plt.subplot(211)             # the first subplot in the first figure
plt.plot([1, 2, 3])
plt.subplot(212)             # the second subplot in the first figure
plt.plot([4, 5, 6])


plt.figure(2)                # a second figure
plt.plot([4, 5, 6])          # creates a subplot(111) by default

plt.figure(1)                # figure 1 current; subplot(212) still current
plt.subplot(211)             # make subplot(211) in figure1 current
plt.title('Easy as 1, 2, 3') # subplot 211 title
#plt.subplot(212)             # make subplot(212) in figure1 current
#plt.title('Easy as 1, 2, 3') # subplot 212 title

# Seeding makes the generated numbers repeatable from run to run.
np.random.seed(0)
print(np.random.rand(4))


# Fixing random state for reproducibility
np.random.seed(19680801)

mu, sigma = 100, 15
x = mu + sigma * np.random.randn(10000)

# the histogram of the data
# `normed` was removed from Matplotlib; `density=True` is its replacement and
# scales the bars so that the histogram integrates to 1.
n, bins, patches = plt.hist(x, 50, density=True, facecolor='g', alpha=0.75)


plt.xlabel('Smarts')
plt.ylabel('Probability')
plt.title('Histogram of IQ')
plt.text(50, .025, r'$\mu=100,\ \sigma=15$')
plt.axis([40, 160, 0, 0.03])  # [xmin, xmax, ymin, ymax]
plt.grid(True)
plt.show()

#-- plot regression ---------

# The original instantiated LinearRegression without ever importing the name.
from sklearn.linear_model import LinearRegression

featuresX = np.loadtxt(r'C:\Users\abhishek_singh79\Desktop\featuresX.txt',dtype=int,delimiter=',')
#print(featuresX[:2,:])
priceY = np.loadtxt(r'C:\Users\abhishek_singh79\Desktop\PriceY.txt',dtype=int)
#print(priceY[2])

reg = LinearRegression()

reg.fit(featuresX, priceY)

# predict() requires a 2-D (n_samples, n_features) input, hence the outer list
print(reg.predict([[1534, 3]]))


# each feature column against the price
plt.plot(featuresX[:, 0], priceY, 'bo')
plt.plot(featuresX[:, 1], priceY, 'ro')
# NOTE: x is 2-D here, so one dashed line is drawn per feature column
plt.plot(featuresX, reg.predict(featuresX), 'k--')


print(reg.coef_)
print(reg.intercept_)
#print(np.prod([1.30728554,-113.50022949],[1534,3]))
# Reproduce the prediction by hand. The coefficients are hard-coded,
# presumably copied from an earlier fit -- compare with reg.coef_ above.
a = np.matrix([1.30728554, -113.50022949])
b = np.matrix([1534, 3])
print(np.sum(np.multiply(a, b)) + reg.intercept_)

# which means our equation after regression is y = c + theta(0)X(0) + theta(1)X(1)
# where c = reg.intercept_ and theta(0), theta(1) = reg.coef_

# the same thing, written with the fitted coefficients directly

m = [1534, 3]
print(reg.coef_)
print(reg.intercept_)
print(sum(reg.coef_ * m))
print(sum(reg.coef_ * m) + reg.intercept_)

#-----

#sin cos again using subplot
t = np.arange(0, 0.98, 0.01)
y1 = np.sin(2*np.pi*4*t)
y2 = np.cos(2*np.pi*4*t)

plt.figure(1)
plt.subplot(211)
plt.plot(t, y1)
plt.ylabel('sin')
# legend() takes a sequence of labels; the bare string 'sin' was iterated
# character by character, so the line ended up labelled just 's'.
plt.legend(['sin'])
plt.subplot(212)
plt.plot(t, y2)
plt.xlabel('time')
plt.ylabel('cos')
# the axis range can also be set; all four of xmin, xmax, ymin, ymax are required
#plt.axis([0.5, 1, 0, 1])
# Save before show(): once the window shown by show() is closed the figure
# is released, and a savefig() issued afterwards writes an empty canvas.
plt.savefig('plot.png')
plt.savefig('plot.pdf')
plt.show()

#visualise a matrix

# Draw the 3x3 matrix as an image with imshow.
a = np.matrix('2, 3, 4; 5, 6, 7; 8, 9, 1')
plt.imshow(a,cmap='gray')
# remove cmap='gray' for a coloured image (the default colormap is used then)
plt.colorbar()
plt.show()

#----------- basic gradient descent (cost function)------------

# the code below lives in a separate .py file named graddesc
import numpy as np
def gradDesc(X, y, theta):
    """Compute the squared-error cost of ``theta`` on the data (X, y).

    Returns ``sum((X . theta - y)^2, axis=0) * 0.5 / m`` where ``m`` is the
    number of rows of X. Only the cost is evaluated; no descent step is taken.
    """
    errors = np.dot(X, theta) - y          # prediction minus target
    sample_count = X.shape[0]              # number of training examples
    scale = 0.5 / sample_count
    return scale * np.sum(np.square(errors), axis=0)


import graddesc  # the separate module that holds gradDesc

# load the features as a 2-D matrix
featuresX = np.matrix(np.loadtxt('featuresX.dat', dtype=int, delimiter=','))
# load the prices and turn them into a column matrix
priceY = np.matrix(np.loadtxt('priceY.dat', dtype=int)).T
# parameter column at which the cost is evaluated
theta = np.matrix([[1.30728554], [-113.50022949]])
costFunc = graddesc.gradDesc(featuresX, priceY, theta)
print(costFunc)

#--------- basic minimization 

import scipy.optimize as op

# for example, take the cost function J(theta) = (theta1 - 5)^2 + (theta2 - 5)^2
# its partial derivatives are dJ/dtheta1 = 2*(theta1 - 5) and dJ/dtheta2 = 2*(theta2 - 5)


def Gradient(theta):
    """Return the gradient of (theta[0]-5)^2 + (theta[1]-5)^2 as a float array.

    Only the first two entries of ``theta`` are read; the result always has
    length 2.
    """
    gradient = np.zeros(2)
    for axis in range(2):
        # d/dtheta_i of (theta_i - 5)^2
        gradient[axis] = 2 * (theta[axis] - 5)
    return gradient


def cosFunc(theta):
    """Objective to minimise: squared distance of (theta[0], theta[1]) from (5, 5)."""
    offset_first = theta[0] - 5
    offset_second = theta[1] - 5
    return offset_first * offset_first + offset_second * offset_second

# starting point for the search: the origin, as an integer array
initialtheta = np.array([0, 0], dtype=int)

# minimise cosFunc, handing its analytic gradient to the optimiser as the Jacobian
costF = op.minimize(cosFunc, initialtheta, jac=Gradient)
print(costF)

#---------- Logistic Regression --------------------

import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mp
from sklearn.linear_model import LogisticRegression
import pandas as pd
import os
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

min_max = MinMaxScaler()
le = LabelEncoder()

# Read every entry of the dataset directory as a headerless CSV, then stack
# the first two frames row-wise.
data_dir = r'C:\Users\avishek\Downloads\Datasets'
datapath = os.listdir(data_dir)
datafiles = [pd.read_csv(os.path.join(data_dir, file), header=None) for file in datapath]
data = pd.concat([datafiles[0], datafiles[1]], axis=0)
# print(data.shape)

# DATAFRAME TO ARRAY
# DataFrame.as_matrix was removed from pandas; .values gives the same array.
# np.array(...) takes a copy so the in-place edits below always hit a
# writable array.
data = np.array(data.values)

# '?' marks a missing entry in the raw files; replace it with '0'
data[data == '?'] = '0'

# TEXT TO NUMERIC PROCESSING
# Label-encode only the columns whose first entry is a string; the other
# columns are left untouched.
for col in range(data.shape[1]):
    if isinstance(data[0, col], str):
        data[:, col] = le.fit_transform(data[:, col])

# DATA ARRAY TO FLOAT
data = data.astype('float32', copy=False)
# print(data[:4,:])

# SPLITTING TO X and Y: the last column is the target
x_data = data[:, :-1]
y_data = data[:, -1:]


# DATA SCALING - y is left unscaled: scaling would make it look continuous,
# while the classifier needs discrete labels
x_data = min_max.fit_transform(x_data)

# Split X and y in ONE call so the rows are guaranteed to stay paired; the
# original split them separately and relied on the identical random_state.
X_train, X_test, Y_train, Y_test = train_test_split(
    x_data, y_data, test_size=0.3, random_state=30)

# print(X_train.dtype, Y_train.dtype)
# LOGISTIC REGRESSION
log = LogisticRegression(penalty='l2', C=.01)
log.fit(X_train, Y_train.ravel())  # fit() expects a 1-D label vector

# ACCURACY
print(accuracy_score(Y_test.ravel(), log.predict(X_test)))
# without feature scaling accuracy near to 50%



# test = np.matrix('1, 3;2, 3')
# print(test.dtype)

# print(Y_train.flatten()[:4])
# print(data.shape)


#--------------- random forest - Titanic Data set ---------- 

# The original used both names without importing them anywhere in the file.
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import roc_auc_score

X = pd.read_csv(r'C:\Users\avishek\Downloads\titanicData\train.csv')

# pop() removes the column from the frame and returns it
y = X.pop("Survived")

#print(y)
# describe() only summarises the numerical columns
#print(X.describe())

# describe() showed that Age has missing values, so fill them with the mean.
# Assign the result back instead of calling fillna(inplace=True) on the
# column selection: that chained form is deprecated in pandas and does not
# reliably write through to X.
X['Age'] = X['Age'].fillna(X['Age'].mean())
#print(X.describe())

# names of the numeric columns; select_dtypes does not depend on text
# columns being stored with the 'object' dtype
numeric_vars = list(X.select_dtypes(include='number').columns)
# Pclass is included here even though it is really categorical (it has only
# a few distinct values)
#print(X[numeric_vars].head())

model = RandomForestRegressor(n_estimators=100, oob_score=True, random_state=42)
model.fit(X[numeric_vars], y)
# attributes with a trailing underscore are only available after the model is trained
print(model.oob_score_)

y_oob = model.oob_prediction_
print('c-stat:', roc_auc_score(y, y_oob))
#print(y_oob.shape)



X.drop(['Name', 'Ticket', 'PassengerId'], axis=1, inplace=True)

# categorical data: names of the remaining non-numeric columns
cat_var_index = list(X.select_dtypes(exclude='number').columns)
# print()
# for cols in X.columns.values:
#     if X[cols].dtypes == 'object':
#         print(X[cols].index)
# print()
#print(X[cat_vars].describe())

def clean_cabin(x):
    """Return the first character of ``x``, or the string 'None'.

    Values that cannot be indexed (for example a float NaN or None) raise
    TypeError, and an empty string raises IndexError; both cases map to the
    placeholder 'None' so the function never fails while cleaning a column.
    """
    try:
        return x[0]
    except (TypeError, IndexError):
        return 'None'

# The original used both names without importing them anywhere in the file.
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import roc_auc_score

# reduce every cabin value to what clean_cabin returns for it
X['Cabin'] = X.Cabin.apply(clean_cabin)
#print(X['Cabin'])

cat_var = list(X[cat_var_index].columns.values)
#print(cat_var)

for var in cat_var:
    # Fill on a fresh Series instead of fillna(inplace=True) on the column
    # selection: that chained form is deprecated in pandas and does not
    # reliably write through to X.
    filled = X[var].fillna('Missing')
    # get_dummies makes one 0/1 indicator column per distinct value
    dummies = pd.get_dummies(filled, prefix=var)
    # replace the original column by its indicator columns
    X = pd.concat([X.drop([var], axis=1), dummies], axis=1)

#print(X.columns.values)
print()
model = RandomForestRegressor(n_estimators=100, oob_score=True, n_jobs=1, random_state=42)
model.fit(X, y)
# attributes with a trailing underscore are only available after the model is trained
#print(model.oob_score_)

y_oob = model.oob_prediction_
print('c-stat:', roc_auc_score(y, y_oob))
print()
# model.feature_importances_ is a bare array, which is hard to read; pairing
# it with the column names and sorting makes the ranking clear
features_importance = pd.Series(model.feature_importances_, index=X.columns)
features_importance = features_importance.sort_values(ascending=False)
#print(features_importance)
#features_importance.plot(kind='barh', figsize=(7,6))
# bar chart of the importances, one bar per feature, largest first
fig, ax = plt.subplots()
positions = np.arange(len(features_importance))
ax.bar(positions, features_importance.values, width=0.8)
ax.set_xticks(positions)
ax.set_xticklabels(features_importance.index, rotation='vertical')
plt.show()

#------------------------ ------------------------
From Data to Data Science

Search This Blog

some Python (Ng)

Comments

Post a Comment