
some Python (Ng)


# ------------------------------ Ng ------------------------------
import math as mt
import os
import numpy as np
import matplotlib.pyplot as plt

# matrix
a = np.matrix([[1, 2, 3], [2, 3, 4]])
print(a)
print()

# the logical operators ('and'/'or') and the bitwise operators ('|'/'&') are different, e.g.
print('bhai' and 'behen')  # prints the second one
print('bhai' or 'behen')   # prints the first one
# print('bhai' | 'behen')  # error: '|' is bitwise and not defined for strings
print(1010 | 101)    # bitwise 'or' of the two integers (they look binary here but they are not)
print(1010 and 123)  # again, it will print the second one
print()

# formatting
print('the value of pi is : {:0.2f}'.format(np.pi))
print()

# b = np.atleast_2d(a) --- try this
b = np.matrix([1, 2, 3])
print(b)
print(b.shape)
c = b.T
print(c)
print(c.shape)
# matrix vs array: a matrix is a 2-d array, a subclass of ndarray;
# a*b on matrices is equal to np.dot(a, b) on arrays
print()
d = np.matrix('1; 2; 3')
print(d)
print(d.dtype)
# also read: .mat files, asmatrix, asarray

print(np.ones((2, 3)))
print()
print(np.random.randint(0, 10, (3, 4)))
print()
print(np.random.random((3, 4)))  # returns random floats in the half-open interval [0.0, 1.0)
print()
print(np.random.randn(5))  # returns a sample (or samples) from the "standard normal" distribution
w = -6 + mt.sqrt(10) * np.random.randn(10000)
# plt.hist(w, 50)
# plt.show()
print()
print(np.eye(3))
# print(help(np.random))

# ---------- moving data around ----------
a = np.array([[1, 2, 3], [2, 3, 4]])
print(a)
print()
b = np.matrix('1, 2, 3 ; 2, 3, 4')
print(b)
print(len(b[1]))  # prints 1: b[1] is still a 1x3 matrix, so len() gives its row count
c = [1, 2, 3]
d = [2, 3, 4]
print()
e = np.array([c, d])
print(e)
print()
f = np.matrix([c, d])
print(f)
print('---')
a = np.array([1, 2, 3])
print(b.size)
print()

# equivalent to the ls command
# ls = [f for f in os.listdir('.') if os.path.isfile(f)]
ls = os.listdir(os.curdir)  # or ls = os.listdir('.')
print(ls)
# pwd (present working directory)
print(os.getcwd())

# ---------------------
# load features
featuresX = np.loadtxt('featuresX.dat', dtype=int, delimiter=',')
featuresX = np.matrix(featuresX)
print(featuresX)
# load price
priceY = np.loadtxt('priceY.dat', dtype=int)
priceY = np.matrix(priceY).T
print()
print(priceY)
print()
# print(featuresX[1, 0])
print(featuresX.shape)
print(priceY.shape)
print()
# print(priceY[:10])
# v = priceY[:10]
# print(v)
# saving data in a file
# np.savetxt('v.dat', v, fmt='%.0f')

# print the 2nd and 4th rows -- it is a 0-indexed matrix
print(featuresX[[1, 3], :])
a = np.matrix('1, 2 ; 2, 3 ; 3, 4')
print(a)
# changing the second column of the above matrix
a[:, 1] = np.matrix('10 ; 11 ; 12')
print()
print(a)
print()
# add one more column
a = np.append(a, np.matrix('100 ; 231 ; 112'), axis=1)
print(a)
# flattening a matrix
print(a.flatten())
c = np.matrix('1, 2 ; 3, 4')
d = np.matrix('11, 12 ; 13, 14')
# appending matrices side by side
e = np.hstack((c, d))
print()
print(e)
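# An optional aside (not in the original notes): np.matrix is a legacy class and
# NumPy now recommends plain ndarrays; the '@' operator gives the matrix product
# there. A minimal sketch on made-up 2x2 data:
p = np.array([[1, 2], [3, 4]])
q = np.array([[5, 6], [7, 8]])
print(p * q)   # element-wise product on ndarrays
print(p @ q)   # matrix product, same as np.dot(p, q)
print(np.matrix(p) * np.matrix(q))  # on np.matrix, '*' already means the matrix product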
# ---------- basic linear regression ... house data ----------
import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt
import csv
import math
from glob import glob
from sklearn import linear_model
# from sklearn.model_selection import train_test_split

# df = pd.read_csv(r'C:\Users\avishek\Downloads\Datasets\adult.data.csv', header=None)
# # print(df.head())
# df.columns = ['age', 'workclass', 'fnlwgt', 'education', 'education-num',
#               'marital-status', 'occupation', 'relationship', 'race', 'sex',
#               'capital-gain', 'capital-loss', 'hours-per-week', 'native-country', 'last-column']
# # print(df[df['workclass'] == 'State-gov'].head())
# # print(df.head())
# # print(df['workclass'] == ' State-gov')
# print(df['sex'].unique())

featuresX = np.loadtxt('featuresX.dat', dtype=int, delimiter=',')
priceY = np.loadtxt('priceY.dat', dtype=int)
priceY = np.matrix(priceY).T
regr = linear_model.LinearRegression()
regr.fit(featuresX, priceY)
# plt.scatter(featuresX[:, 0], priceY, color='black')
# plt.show()
# plt.scatter(featuresX[:, 1], priceY, color='black')
# plt.show()
print(regr.predict([[2104, 3]]))  # predict() expects a 2-d array: one row per sample
print()
print(regr.predict([[1416, 2]]))

# --- same code using an Excel file
df = pd.read_excel(r'C:\Users\avishek\Downloads\housing.xlsx', sheet_name='Sheet1', header=None)
# print(df)
F_X = np.matrix(df[[0, 1]])
# print(F_X)
P_Y = np.matrix(df[2]).T
regr = linear_model.LinearRegression()
regr.fit(X=F_X, y=P_Y)
print(regr.predict([[1203, 3]]))
# also, we can't use accuracy_score with a regression problem

# ----------- basic gradient descent (cost function) ------------
# the code below lives in a separate .py file named graddesc.py
import numpy as np

def gradDesc(X, y, theta):
    m = X.shape[0]  # number of training examples
    prediction = np.dot(X, theta)  # X times theta
    sqErrors = np.square(prediction - y)
    return 0.5 / m * np.sum(sqErrors, axis=0)

import graddesc  # importing the above file

featuresX = np.loadtxt('featuresX.dat', dtype=int, delimiter=',')
featuresX = np.matrix(featuresX)
# load price
priceY = np.loadtxt('priceY.dat', dtype=int)
priceY = np.matrix(priceY).T
theta = np.matrix('1.30728554; -113.50022949')
costFunc = graddesc.gradDesc(featuresX, priceY, theta)
print(costFunc)
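# A hedged sketch (not in the original notes): gradDesc above only evaluates the
# cost J(theta); the actual descent step is theta := theta - (alpha/m) * X'(X theta - y).
# The data and learning rate below are made up for illustration.
def gradDescStep(X, y, theta, alpha):
    m = X.shape[0]
    grad = X.T @ (X @ theta - y) / m  # partial derivatives of the squared-error cost
    return theta - alpha * grad

X_demo = np.array([[1.0, 1.0], [1.0, 2.0], [1.0, 3.0]])  # first column = bias term
y_demo = np.array([[2.0], [3.0], [4.0]])
theta_demo = np.zeros((2, 1))
for _ in range(1000):
    theta_demo = gradDescStep(X_demo, y_demo, theta_demo, alpha=0.1)
print(theta_demo)  # approaches [[1.], [1.]] since y = 1 + 1*x exactly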
# -------- reading multiple files and concatenating them into one
filelist = os.listdir(r'C:\Users\avishek\Downloads\Datasets')
# print(filelist)
df_list = [pd.read_csv(r'C:\Users\avishek\Downloads\Datasets\\' + file, header=None) for file in filelist]
df = pd.concat(df_list, axis=0)
# print(df_list[0].head())
print(df.head())
print(df.shape)

# ------- same thing as above using glob
path = r'C:\Users\avishek\Downloads\Datasets'
filelist = glob(path + '/*.*.csv')
print(filelist)
df_list = [pd.read_csv(f, header=None) for f in filelist]

# ----------- computing on data -----------
a = np.matrix('1, 2 ; 3, 4 ; 5, 6')
# print(a)
b = np.matrix('11, 12 ; 13, 14 ; 15, 16')
c = np.matrix('1, 1 ; 2, 2')
d = np.dot(a, c)
print(d)
print()
# element-wise multiplication
e = np.multiply(a, b)
print(e)
print()
print(np.square(a))
print()
print(np.reciprocal(a.astype(float)))
print()
print(np.log(a))
print()
m = np.matrix('-1, 2 ; -3, 4 ; 5, -6')
print(np.absolute(m))
print()
print(-m)
print()
# element-wise addition
print(m + 1)
print()
print(np.max(m, axis=0))
print()
# gives the index after flattening
print(m.argmax())
print()
print(m[m > 0])
print()
print(np.sum(m, axis=1))
print()
print(np.cumsum(m, axis=1))
print()
print(np.prod(m))
print()

# -------- plotting
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mp

t = np.arange(0, 0.98, 0.01)
# print(t)
y = np.sin(2 * np.pi * 4 * t)
plt.plot(t, y, linewidth=2.0)
# plt.plot(y)
# plt.plot([1, 2, 3, 4])
# plt.ylabel('some numbers')
# when given only one vector it is treated as y and x is generated automatically;
# as there are 4 elements for y, x will also have 4 elements, and since the index
# starts at 0 the four elements of x = [0, 1, 2, 3]

x = np.array([1, 2, 3, 4])
y = np.square(x)
# we can capture the line object like this
line, = plt.plot(x, y, '-')
print(line)
# and we can apply different properties to the variable
plt.setp(line, color='r')  # setp = set line properties
# or plt.setp(line, 'color', 'r') -- same as above, MATLAB style
line.set_antialiased(False)
print(y)
plt.plot(x, y)
plt.plot(x, y, 'ro')  # 'ro' means red o's

# evenly sampled time at 200 ms intervals
t = np.arange(0., 5., 0.2)
# multiple graphs in a single plot using different formatting:
# red dashes, blue squares and green triangles
plt.plot(t, t, 'r--', t, t**2, 'bs', t, t**3, 'g^')
plt.show()

def f(t):
    return np.exp(-t) * np.cos(2 * np.pi * t)

t1 = np.arange(0.0, 5.0, 0.1)
t2 = np.arange(0.0, 5.0, 0.02)
plt.figure(1)
plt.subplot(211)
plt.plot(t1, f(t1), 'bo', t2, f(t2), 'k')
plt.subplot(212)
plt.plot(t2, np.cos(2 * np.pi * t2), 'r--')
plt.show()
# subplot(211) is equivalent to subplot(2, 1, 1), where 2 = number of rows,
# 1 = number of columns, 1 = subplot number

plt.figure(1)                 # the first figure
plt.subplot(211)              # the first subplot in the first figure
plt.plot([1, 2, 3])
plt.subplot(212)              # the second subplot in the first figure
plt.plot([4, 5, 6])
plt.figure(2)                 # a second figure
plt.plot([4, 5, 6])           # creates a subplot(111) by default
plt.figure(1)                 # figure 1 current; subplot(212) still current
plt.subplot(211)              # make subplot(211) in figure 1 current
plt.title('Easy as 1, 2, 3')  # subplot 211 title
# plt.subplot(212)              # make subplot(212) in figure 1 current
# plt.title('Easy as 1, 2, 3')  # subplot 212 title

np.random.seed(0)
print(np.random.rand(4))

# fixing the random state for reproducibility
np.random.seed(19680801)
mu, sigma = 100, 15
x = mu + sigma * np.random.randn(10000)
# the histogram of the data (density=True replaced the old normed=1 keyword)
n, bins, patches = plt.hist(x, 50, density=True, facecolor='g', alpha=0.75)
plt.xlabel('Smarts')
plt.ylabel('Probability')
plt.title('Histogram of IQ')
plt.text(50, .025, r'$\mu=100,\ \sigma=15$')
plt.axis([40, 160, 0, 0.03])
plt.grid(True)
plt.show()

# -------- plot regression ---------
from sklearn.linear_model import LinearRegression

featuresX = np.loadtxt(r'C:\Users\abhishek_singh79\Desktop\featuresX.txt', dtype=int, delimiter=',')
# print(featuresX[:2, :])
priceY = np.loadtxt(r'C:\Users\abhishek_singh79\Desktop\PriceY.txt', dtype=int)
# print(priceY[2])
reg = LinearRegression()
reg.fit(featuresX, priceY)
print(reg.predict([[1534, 3]]))
plt.plot(featuresX[:, 0], priceY, 'bo')
plt.plot(featuresX[:, 1], priceY, 'ro')
plt.plot(featuresX, reg.predict(featuresX), 'k--')
print(reg.coef_)
print(reg.intercept_)
# print(np.prod([1.30728554, -113.50022949], [1534, 3]))
a = np.matrix([1.30728554, -113.50022949])
b = np.matrix([1534, 3])
print(np.sum(np.multiply(a, b)) + reg.intercept_)
# which means our equation after regression is y = c + theta(0)*X(0) + theta(1)*X(1),
# where c = reg.intercept_ and theta(0), theta(1) = reg.coef_
# we can also write the above as
m = [1534, 3]
print(reg.coef_)
print(reg.intercept_)
print(sum(reg.coef_ * m))
print(sum(reg.coef_ * m) + reg.intercept_)
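# A hedged aside (not in the original notes): the coefficients sklearn finds can be
# reproduced with the closed-form normal equation theta = (X'X)^-1 X'y after
# prepending a column of ones for the intercept. Assumes featuresX and priceY
# as loaded just above.
Xb = np.hstack([np.ones((featuresX.shape[0], 1)), featuresX])
theta_closed = np.linalg.solve(Xb.T @ Xb, Xb.T @ priceY)
print(theta_closed)  # [intercept, theta(0), theta(1)] -- should match reg.intercept_ and reg.coef_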
# ----- sin and cos again, using subplot
t = np.arange(0, 0.98, 0.01)
y1 = np.sin(2 * np.pi * 4 * t)
y2 = np.cos(2 * np.pi * 4 * t)
plt.figure(1)
plt.subplot(211)
plt.plot(t, y1)
plt.ylabel('sin')
plt.legend(['sin'])  # legend takes a list of labels
plt.subplot(212)
plt.plot(t, y2)
plt.xlabel('time')
plt.ylabel('cos')
# you can also set the axis range; all 4 values are necessary: xmin, xmax, ymin, ymax
# plt.axis([0.5, 1, 0, 1])
plt.show()

# saving plots
plt.savefig('plot.png')
plt.savefig('plot.pdf')

# visualise a matrix
a = np.matrix('2, 3, 4; 5, 6, 7; 8, 9, 1')
plt.imshow(a, cmap='gray')  # for a colourful plot remove cmap
plt.colorbar()
plt.show()

# --------- basic minimization
import scipy.optimize as op

# for example, we have a cost function j(theta) = (theta1 - 5)^2 + (theta2 - 5)^2,
# so the first partial derivative becomes 2*(theta1 - 5) and the second becomes 2*(theta2 - 5)
def Gradient(theta):
    gradient = np.zeros(2)
    gradient[0] = 2 * (theta[0] - 5)
    gradient[1] = 2 * (theta[1] - 5)
    return gradient

def cosFunc(theta):
    # function to minimize
    costF = (theta[0] - 5) * (theta[0] - 5) + (theta[1] - 5) * (theta[1] - 5)
    return costF

initialtheta = np.zeros(2)
costF = op.minimize(fun=cosFunc, x0=initialtheta, jac=Gradient)
print(costF)

# ---------- Logistic Regression --------------------
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mp
from sklearn.linear_model import LogisticRegression
import pandas as pd
import os
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

min_max = MinMaxScaler()
le = LabelEncoder()
datapath = os.listdir(r'C:\Users\avishek\Downloads\Datasets')
datafiles = [pd.read_csv(r'C:\Users\avishek\Downloads\Datasets\\' + file, header=None) for file in datapath]
data = pd.concat([datafiles[0], datafiles[1]], axis=0)
# print(data.shape)

# DATAFRAME TO MATRIX (DataFrame.as_matrix was removed from pandas; .values is the replacement)
data = data.values
data[data == '?'] = '0'

# TEXT TO NUMERIC PROCESSING - the code below works
# for rows in range(data.T.shape[0]):
#     le.fit(data.T[rows])
#     data.T[rows] = le.transform(data.T[rows])

# a better way of doing the above: only encode the string columns
# TEXT TO NUMERIC PROCESSING
for rows in range(data.T.shape[0]):
    if type(data.T[rows][0]) is str:
        # print(type(data.T[rows][0]) is str)
        le.fit(data.T[rows])
        data.T[rows] = le.transform(data.T[rows])

# print(data[:0, :])
# DATA MATRIX TO FLOAT
data = data.astype('float32', copy=False)
# print(data[:4, :])
# print(data[:1, :])

# SPLITTING INTO X AND Y
x_data = data[:, :-1]
y_data = data[:, -1:]

# DATA SCALING - try not to scale y: that would treat y as continuous,
# but we need discrete values for y
x_data = min_max.fit_transform(x_data)
X_train, X_test = train_test_split(x_data, test_size=0.3, random_state=30)
Y_train, Y_test = train_test_split(y_data, test_size=0.3, random_state=30)
# print(X_train.dtype, Y_train.dtype)

# LOGISTIC REGRESSION
log = LogisticRegression(penalty='l2', C=.01)
log.fit(X_train, Y_train.flatten())

# ACCURACY
print(accuracy_score(Y_test, log.predict(X_test)))
# without feature scaling the accuracy stays near 50%

# test = np.matrix('1, 3; 2, 3')
# print(test.dtype)
# print(Y_train.flatten()[:4])
# print(data.shape)
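# A hedged aside (not in the original notes): LogisticRegression's predict() is
# just a 0.5 threshold on the sigmoid of the linear score w.x + b; the snippet
# below verifies that on tiny made-up data.
demo_X = np.array([[0.0], [1.0], [2.0], [3.0]])
demo_y = np.array([0, 0, 1, 1])
clf = LogisticRegression().fit(demo_X, demo_y)
score = clf.decision_function(demo_X)  # w.x + b
prob = 1.0 / (1.0 + np.exp(-score))    # sigmoid by hand
print(np.allclose(prob, clf.predict_proba(demo_X)[:, 1]))  # True
print(clf.predict(demo_X))             # same as (prob > 0.5).astype(int)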
# --------------- random forest - Titanic data set ----------
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import roc_auc_score

X = pd.read_csv(r'C:\Users\avishek\Downloads\titanicData\train.csv')
# pop removes a column from the df and returns it
y = X.pop("Survived")
# print(y)
# describe() only prints and describes the numerical data
# print(X.describe())
# from describe we saw that Age has missing values,
# so we need to fill those up with the mean value
X['Age'].fillna(X.Age.mean(), inplace=True)
# print(X.describe())

# now take only the numeric variables, i.e. the ones that are not of object dtype
numeric_vars = list(X.dtypes[X.dtypes != 'object'].index)  # column names of the numeric data
# here we still get Pclass even though it is a categorical variable (it has few distinct values)
# print(X[numeric_vars].head())

model = RandomForestRegressor(n_estimators=100, oob_score=True, random_state=42)
model.fit(X[numeric_vars], y)
# all the attributes with a trailing underscore are only available after the model is trained
print(model.oob_score_)
y_oob = model.oob_prediction_
print('c-stat:', roc_auc_score(y, y_oob))
# print(y_oob.shape)

X.drop(['Name', 'Ticket', 'PassengerId'], axis=1, inplace=True)

# categorical data
cat_var_index = list(X.dtypes[X.dtypes == 'object'].index)  # column names of the categorical data
# print()
# for cols in X.columns.values:
#     if X[cols].dtypes == 'object':
#         print(X[cols].index)
# print()
# print(X[cat_var_index].describe())

def clean_cabin(x):
    try:
        return x[0]
    except TypeError:  # NaN entries are floats, so x[0] raises TypeError
        return 'None'

X['Cabin'] = X.Cabin.apply(clean_cabin)
# print(X['Cabin'])

cat_var = list(X[cat_var_index].columns.values)
# print(cat_var)
for var in cat_var:
    X[var].fillna('Missing', inplace=True)
    # get_dummies is like the label encoder above: it makes a separate column
    # for each distinct value, with values 0 and 1
    dummies = pd.get_dummies(X[var], prefix=var)
    X = pd.concat([X, dummies], axis=1)
    X.drop([var], axis=1, inplace=True)
# print(X.columns.values)
print()

model = RandomForestRegressor(n_estimators=100, oob_score=True, n_jobs=1, random_state=42)
model.fit(X, y)
# print(model.oob_score_)
y_oob = model.oob_prediction_
print('c-stat:', roc_auc_score(y, y_oob))
print()

# the code below shows the importance of each feature
# print(model.feature_importances_)
# but that is not very readable, so:
features_importance = pd.Series(model.feature_importances_, index=X.columns)
features_importance.sort_values(ascending=False, inplace=True)
# print(features_importance)
# features_importance.plot(kind='barh', figsize=(7, 6))
# plotting the above data for a better visualization
fig, ax = plt.subplots()
ax.bar(np.arange(len(features_importance)), features_importance.values, width=0.8)
ax.set_xticks(np.arange(len(features_importance)))
ax.set_xticklabels(features_importance.index, rotation='vertical')
plt.show()
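# A hedged aside (not in the original notes): the notes use a *regressor* on the
# 0/1 target, so oob_prediction_ acts like a probability. A RandomForestClassifier
# gives the same kind of out-of-bag score via oob_decision_function_;
# a sketch, assuming X and y as prepared above:
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(n_estimators=100, oob_score=True, random_state=42)
clf.fit(X, y)
y_oob_clf = clf.oob_decision_function_[:, 1]  # out-of-bag P(Survived=1) per passenger
print('c-stat (classifier):', roc_auc_score(y, y_oob_clf))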
