
Some Python (NumPy, Pandas)

############ numpy #############################
#importing numpy
import numpy as np
#numpy matrices are strictly 2d while numpy arrays can be multidimensional;
#numpy matrix is a subclass of numpy ndarray
#creating a numpy array from a list
mylist = [1,3,6,2,7,4,8]
x = np.array(mylist)
print(x)
x = np.array([1,3,6,2,7,4,8])
print(x)
#creating a 2d array
x = np.array([[1,2,3],[2,3,4]])
print(x)
## if you do something like below
x = np.array([[1,2,3],[2,3]])#lengths of the two lists don't match
print(x)
#you will get the output as [list([1, 2, 3]) list([2, 3])]
#shape of the array
x.shape#outputs a tuple (rows, columns)
#like range
n = np.arange(0,11,2)#np.arange(start, end, step)
print(n)#[0 2 4 6 8 10], always excludes the end value
#reshaping arrays
m = n.reshape(2,3)#here we reshape the 6-element array into a 2x3 2d array
print(m)
#you can also give only the row or the column value and set the other to -1
#so n.reshape(-1,3) = n.reshape(2,-1) = n.reshape(2,3) for the above array
#if you try to reshape a 7-element array into a 2x3 array, it will give an error
#dividing a range into equal spaces
o = np.linspace(1,10,20)#here we divide 1 to 10 into 20 equidistant numbers
print(o)
#resize, not the same as reshaping
p = np.resize(o,(5,5))
#o only has 20 elements, but we can still make a 5x5 (25-element) 2d array out of it;
#resize uses fewer numbers when the new array is smaller than the original,
#and repeats the numbers when the new array is larger
print(p)
#array of ones
print(np.ones((3,2)))
#array of zeros
print(np.zeros((3,2)))
#identity 2d array
print(np.eye(3))
#diagonal
print(np.diag([2,3,4]))
#scalar multiplication
print(np.array([1,2,3])*2)
print(np.array(([1,2,3],[2,3,4]))*2)
#doubling/repeating all the elements
print(np.array([1,2,3]*2))#here [1,2,3]*2 = [1,2,3,1,2,3]
print(np.array(([1,2,3],[2,3,4])*2))#similarly we can think of (1,2)*2 = (1,2,1,2), where 1=[1,2,3] and 2=[2,3,4]
#repeat each element thrice
print(np.repeat([1,2,3],3))#output = [1 1 1 2 2 2 3 3 3]
#specifying that the array will hold floats
p = np.ones([2,3],float)
print(p)
#stack 2 arrays vertically
print(np.vstack([p,2*p]))#here we stack p on top of 2p
#stack 2 arrays horizontally
print(np.hstack([p,2*p]))#here we stack 2p to the right of p
#basic array operations, these are element-wise
x = np.array([1,2,3])
y = np.array([3,4,5])
print(x+y)
print(x*y)
print(x-y)
print(x.dot(y))#1*3 + 2*4 + 3*5 = 26
a = np.array([-2,-1,5,-5,6,8,4,2])
print(a.sum())
print(a.max())
print(a.mean())
#standard deviation
print(a.std())
#print index of max or min
print(a.argmax())#similarly argmin
#transpose of an array
y = np.array([x,x**2])#this is equivalent to np.array([[1,2,3],[1,4,9]])
print(y)
#transpose
print(y.T)
#reciprocal
np.reciprocal(a.astype(float))
#absolute
np.absolute(a)
#log
np.log(a)#note: log of the negative elements gives nan with a warning
#data type of the elements of an array; remember, all elements in a numpy array have the same datatype
print(y.dtype)
x = np.array([[1,2,3],[1.2,2.3,3.4]])#here all the elements are implicitly cast to float
#explicit type casting
y = y.astype('f')
print(y.dtype)
#indexing and slicing
a = np.arange(13)**2
print(a)
#for a 1d array it is the same as a list
print(a[0],a[1],a[:3])
#for a 2d array
b = np.resize(a,(3,4))
print(b)
#row 2, column 2 (i.e. the 3rd row and 3rd column, since indexing starts at 0)
print(b[2,2])
#3rd row, all the columns
print(b[2,:])
#1st and 2nd row, all but the last column
print(b[:2,:-1])
#print every second element from the last row
print(b[-1,1::2])#starts from the 2nd column
print(b[-1,::2])#starts from the 1st column
#conditional
print(b[b>20])#prints all the elements > 20
#changing a slice (view) of an array changes the original array
b[b>30] = 40#if you print b after this statement, all the values > 30 will have become 40
b2 = b[:2,:2]#here b2 is a sub-array (view) of b
print(b2)
b2[:] = 0#here we set all of b2 to 0
#changing b2 changed b as well
print(b)#from here we can see that b also changes
#to avoid this we use the copy function
b = np.resize(a,(3,4))
print(b)
b2 = b.copy()
print(b2)
b2[:] = 0
print(b)
print(b2)
#iterate over an array's rows
test = np.random.randint(0,10,(4,3))
print(test)
print('-------------------------')
for row in test:
    print(row)
print('-------------------------')
for i in range(len(test)):
    print(test[i])
print('------------------------')
#enumerate returns i, L[i]
for i,row in enumerate(test):
    print('row',i+1,'is',row)
test2 = test**2
print('-------------------------')
#iterating through both arrays
for i,j in zip(test,test2):
    print(i,'+',j,'=',i+j)
#len of an array = number of rows
#rank = number of dimensions
#zipping and unzipping
zp = list(zip([1,2,3],[3,4,5]))#zp = [(1, 3), (2, 4), (3, 5)]
unzp = list(zip(*zp))#unzp = [(1, 2, 3), (3, 4, 5)]
#some other things
a = np.array([[1,2,3],[2,3,4]])
print(a)
#number of rows
print(len(a))
#number of columns
print(a.shape[1])
#number of dimensions
print(a.ndim)#1 for a vector, 2 for a 2d array, and so on
a = np.arange(15).reshape(3, 5)
print(a.dtype.name)#same as a.dtype, outputs the data type
#pi
print(np.pi)
#example
x = np.arange(0,2*np.pi,np.pi/2)
print(np.sin(np.pi))
#exponential
d = np.exp(a)
#dtype format: you can write dtype= or simply pass complex
b = np.array([(1,2,3),(3,4,5)],dtype=complex)#each element is in complex a + bi format; here all the imaginary parts are 0
print(b)
#the function empty creates an array whose initial content is random and depends on the state of the memory
c = np.empty((2,3),float)
print(c)
#limit the printed values using ...
#e.g. if we print np.arange(10000) we get something like [   0    1    2 ... 9997 9998 9999]
#so we can set above how many elements the output gets summarized
np.set_printoptions(threshold=20)#arrays with more than 20 elements will be summarized with ...
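#to go back to printing full arrays, one common approach (a small sketch) is to raise the threshold again;
#sys.maxsize is the usual "never summarize" value
import sys
np.set_printoptions(threshold=20)
print(np.arange(100))#prints [ 0  1  2 ... 97 98 99]
np.set_printoptions(threshold=sys.maxsize)#disables summarization again
print(np.arange(100))#prints all 100 values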
#dot product / matrix multiplication
a = np.array([(1,2),(2,3)])
b = np.array([(2,3),(1,2)])
print(a*b)#this is the element-by-element product
#for the dot product
print(a.dot(b))
print(np.dot(a,b))
print(a@b)
#all three above give the same result, the dot product / matrix multiplication
#but if a and b were of matrix type, not 2d array type, we could have simply used a*b
a1 = np.mat(a)
b1 = np.mat(b)
#a1*b1 gives the same result as a@b
#check the difference between np.mat and np.matrix
#row sum, column sum, cumulative sum
e = np.arange(12).reshape(3,4)
print(e)
#sum of each column
print(e.sum(axis=0))
#sum of each row
print(e.sum(axis=1))
#cumulative sum along each row
print(e.cumsum(axis=1))
#appending and deleting
n = np.array([1,2,3,4])
print(n)
n = np.delete(n,3)
print(n)
n = np.append(n,4)
print(n)
#random
#need to specify the range of random integers
a = np.random.randint(1,5,(2,3))
print(a)
#no need to specify a range for random floats
b = np.random.random((2,3))
print(b)
#print a reversed array
c = np.arange(10)
print(c)
print(c[::-1])
#cube root
d = np.array([-125, 1, -125, 27, -125, 125, 216, 343, 512, 729],dtype=int)
print(d)
#the cube root of a number is possibly multivalued, so this might return nan for the negatives
for i in d:
    print(i,i**(1/3))
#the fromfunction call below generates a 5x4 array from the indexes, e.g. 00,01,02,03 ; 10,11,12,13 and so on
#so when e.g. (row=1, col=1) is passed to f(x,y) it returns 10*1+1 = 11 = the value in row 1, col 1
#remember indexes start from 0, so that is the 2nd row, 2nd column value
def f(x,y):
    return 10*x+y
f = np.fromfunction(f,(5,4),dtype=int)
print(f)
#some things to note:
#f[:,-1] is the same as f[...,-1]
#f[1,:] is the same as f[1] and the same as f[1,...]
#columns can be left out, e.g. f[1], but not rows, e.g. f[,-1] is wrong
#3D array (two stacked 2D arrays)
g = np.array([[[  0,  1,  2],
               [ 10, 12, 13]],
              [[100,101,102],
               [110,112,113]]])
print()
print(g)
print()
print(g[1,:,:])#we can think of this as 2 vertically stacked 2d arrays, so g[1] gives the 2nd 2d array
print()
print(g[...,2])#from both arrays take the 3rd column
print()
#mean vs average
#np.mean returns the simple mean, but np.average takes additional parameters such as weights, see the sketch below
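#a small sketch of np.average with weights (the numbers here are made up for illustration)
vals = np.array([10,20,30,40])
w = np.array([1,2,3,4])
print(np.mean(vals))#plain mean = 25.0
print(np.average(vals))#same as the mean when no weights are given
print(np.average(vals,weights=w))#(10*1 + 20*2 + 30*3 + 40*4)/(1+2+3+4) = 30.0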
#iterating over a 3d array
for row in g:#two loops over the two 2d arrays
    print(row)
#flattening arrays
for i in g.flat:#g.flat creates an iterator object over the flattened array
    print(i)
for i in g.flatten():#.flatten() creates a new array object
    print(i)
g.ravel()#similar to flatten, but it returns a view where possible, so modifying the raveled array can modify the original array
g.reshape(-1)#or we can simply reshape to flatten
#size
g.size#gives the number of elements in an array: rows * columns * 3rd dim (if any) * and so on
np.resize(g,g.size)#we can use this as well to flatten an array
#transpose
g.T
np.transpose(g)#same as above but with added parameters to decide the axes
#repeat
np.repeat(1,20)#this creates a numpy array of 20 elements, all 1
#stacking
a = [1,2,3]
b = [4,5,6]
print(np.column_stack([a,b]))
print('----------')
print(np.vstack([a,b]))
print('----------')
print(np.hstack([a,b]))
print('----------')
print(np.stack([a,b],axis=-1))
print('----------')
print(np.concatenate((a,b),axis=0))
#also try the above with 2d arrays; in the above functions we can also write np.vstack((a,b)), i.e. () instead of []
#to add a dimension
a = np.array([1,2,3,4])
a[:,np.newaxis]#this converts the above 1d array into a 2d array
#np.r_('a,b,c',arr1,arr2) (row-wise merging, axis 0)
#a = the axis to concatenate along
#b = the minimum number of dimensions to force the entries to
#c = which axis should contain the start of the arrays
m = np.array([[0, 1, 2],[3, 4, 5]])
print(np.r_['-1',m,m])#this is the same as np.hstack((m,m))
#similarly we have np.c_ (column-wise merging, axis 1)
#flooring
np.floor(somedecimalarray)
np.ceil(somedecimalarray)
#split arrays
print(np.hsplit(m,3))#horizontal split into 3
print()
print(np.hsplit(m,(3,4)))#split m after the third and the fourth column
#columns 0 to 2 in one array, column 3 in one array, columns 4 to last in one array
#vsplit splits along the vertical axis, and array_split lets you specify along which axis to split, see the sketch below
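#a quick sketch of vsplit and array_split (the array q here is made up for illustration);
#note that array_split also allows uneven splits, which hsplit would reject
q = np.arange(12).reshape(4,3)
print(np.vsplit(q,2))#two 2x3 pieces, split along the rows
print(np.array_split(q,2,axis=1))#the 3 columns get split into groups of 2 and 1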
#unique identifier
id(m)#gives you a unique number for the object
n = m
#now id(m) will be equal to id(n)
#view
z = a.view()
print(z)
print(z is a)#False, as z and a are different objects
print(z.base is a)#True, as z is a view of the data owned by a
#if you change z's data it also changes a's data
#use another array as indices
a = np.arange(12)**2#the first 12 square numbers
print(a)
print()
i = np.array([1,1,3,8,5])#an array of indices
print(a[i])#the elements of a at the positions i
print()
j = np.array([[3,4],[9,7]])#a two-dimensional array of indices
print(a[j])#the result has the same shape as j
#masking
a = np.arange(12).reshape(3,4)
print(a)
print()
b = [False,True,True]
print(a[b])
print()
print(a[[True,True,False]])
#indexing multi-dimensional arrays
y = np.arange(35).reshape(5,7)
y[[0,2,4],[0,1,2]]#this means: give me the numbers at [0,0], [2,1] and [4,2]
#for more visit https://docs.scipy.org/doc/numpy/user/basics.indexing.html
#full
np.full((2,2),3)#creates a 2x2 2d array with all elements 3
#type conversion
a.astype(np.float32)
#sorting
a.sort()#sorts a in place; if you now print a it will be sorted
#I/O operations, saving and loading
#from numpy binary files:
#np.load(file, ...) - load arrays or pickled objects from .npy, .npz or pickled files
#np.save(file, arr, ...) - save an array to a binary file in NumPy .npy format
#np.savez(file, *args, **kwds) - save several arrays into a single file in uncompressed .npz format
#np.savez_compressed(file, *args, **kwds) - save several arrays into a single file in compressed .npz format
#from text files:
#np.loadtxt(fname, ...) - load data from a text file
#np.savetxt(fname, X, ...) - save an array to a text file
#np.genfromtxt(fname, ...) - load data from a text file, with missing values handled as specified
#np.fromregex(file, regexp, dtype, ...) - construct an array from a text file, using regular expression parsing
#np.fromstring(string, ...) - a new 1-D array initialized from text data in a string
#ndarray.tofile(fid, ...) - write an array to a file as text or binary (default)
#ndarray.tolist() - return the array as a (possibly nested) list
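#a minimal save/load roundtrip sketch (the file name demo.npy is made up for illustration)
arr = np.arange(6).reshape(2,3)
np.save('demo.npy',arr)#writes a binary .npy file
loaded = np.load('demo.npy')#reads it back
print(np.array_equal(arr,loaded))#True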
#** the .copy() operation is called deep copying
#the term broadcasting describes how numpy treats arrays with different shapes during arithmetic operations;
#subject to certain constraints, the smaller array is "broadcast" across the larger array so that they have compatible shapes, see the sketch below
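#a tiny broadcasting sketch (the arrays here are made up for illustration):
#the 1d array is stretched across each row of the 2d array
big = np.arange(9).reshape(3,3)
small = np.array([10,20,30])#shape (3,) gets broadcast across each row of the 3x3 array
print(big + small)#adds [10,20,30] to every row
print(big * 2)#a scalar broadcasts too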
#matrix
a = np.matrix([[1,2,3],[2,3,4]])
print(a)
#atleast_2d converts the array into 2d if its dim < 2
b = np.atleast_2d(a)
#matrix vs array: matrices are strictly 2d, and matrix is a subclass of ndarray
#a*b on matrices is equal to np.dot(a,b) on arrays
d = np.matrix('1; 2; 3')#creates a 2d matrix with one element per row: 1, 2 and 3 respectively
print(d)#d.shape = (3,1)
print(d.dtype)
#also read about .mat, asmatrix, asarray
print(np.ones((2,3)))
print()
print(np.random.randint(0,10,(3,4)))
print()
print(np.random.random((3,4)))#returns random floats in the half-open interval [0.0, 1.0)
print()
print(np.random.randn(5))#returns a sample (or samples) from the "standard normal" distribution
#load and save data
featuresX = np.loadtxt('featuresX.dat',dtype=int,delimiter=',')
featuresX = np.matrix(featuresX)
#saving data to a file
np.savetxt('file.dat',v,fmt='%.0f')

######################## pandas ####################################
import pandas as pd
#series
s = pd.Series(np.random.randn(5),index=['a','b','c','d','e'])
print(s)
#we can index a series the same way as a numpy array: s[1:3] gives the 2nd and 3rd rows, s[::-1] prints in reverse
#key-value pairs
arr2 = {'india':'cricket','japan':'karate','china':'momo','france':'kiss','germany':'jews'}
a = pd.Series(arr2)
print(a)#this prints india, japan etc. as the index and cricket, karate etc. as the values
#the below is applicable for key-value paired iterables
a = pd.Series(arr2,index=['india','china','japan','usa'])
print(a)#this prints only the indexed india, china, japan; for usa we don't have any value, so NaN gets printed
#dataframe
ser1 = pd.Series({'name':'abhishek','item':'peanut butter','price':300})
ser2 = pd.Series({'name':'jiya','item':'chocolate','price':60})
ser3 = pd.Series({'name':'badadum','item':'bums','price':100})
df1 = pd.DataFrame([ser1,ser2,ser3],index=['store1','store1','store2'])
print(df1)#it will have name, item, price as columns
#or you can define it as below
df1 = pd.DataFrame({'name':['abhishek','jiya','badadum'],'item':['peanut butter','chocolate','bums'],'price':[300,60,100]},index=['store1','store1','store2'])
#or like this, from a list of dictionaries
df = pd.DataFrame([{'name':'abhishek','item':'peanut butter','price':300},
                   {'name':'jiya','item':'chocolate','price':60},
                   {'name':'badadum','item':'bums','price':100}],
                  index=['store1','store1','store2'])
#you can add new columns and data simply
df['Date'] = ['december 1','January 1','march 1']#adds a column called Date with the given values
df['feedback'] = ['Positive',None,'Negative']#another column, showing you can add None values too
df['Delivered'] = True#this creates a column named Delivered where every value is True
df['newcol'] = pd.Series({'store2':'someData'})#this creates a column newcol whose value for index store2 is someData; the rest of the values are NaN
#indexing
df1.loc['store1']#prints the 1st and 2nd rows
df1.iloc[0]#prints the first row, but as a series
df1.iloc[0:1]#prints the first row in dataframe format; similarly we can do df1.iloc[:]
df1.loc[:,['item','price']]#all rows, only the item and price columns
df1.loc['store1':'store2']#outputs all the rows from index store1 through index store2, both included, in dataframe order
#transposing
df1.T#the indexes become columns and vice versa
#delete, remove or drop a column or row
df1.drop('item',axis=1)#drops the column; the original DF remains unchanged; use the parameter inplace=True if you want to change the original DF, or assign the result to a new variable
#** remember to use .copy() for storing the new DF unless you don't mind the original DF getting changed
df1.drop('store1')#drops rows; the original DF remains unchanged
df2 = df1.copy()#made a copy to make sure df1 is not affected
del df2['price']
#del is a common python keyword to remove any object; it is not specific to pandas, where we have drop as seen above
#read csv, excel and common things
df = pd.read_csv(r'C:\Users\avishek\Downloads\oly.csv',index_col=0,skiprows=1)
#here we read a csv file, use the first (0th index) column as the index, and skip 1 row as it might not be required
df.head()#get the first 5 rows of the DF; similarly we have df.tail()
df.keys()#get the column labels as an index
df.columns#same as above
df.columns.values#get the column labels, but in array format
df.index#gives you all the index values
#renaming indexes by removing everything after a space, e.g. if an index is 'abc def' we get abc
for i in df.index:
    df.rename(index={i:i.split(' ')[0]},inplace=True)
#we can do the same for the column names if they are not proper
#masking
df.SomeColumn>0#creates a series of boolean values which we can use to filter the dataframe
df[df.SomeColumn>0]#gives only those rows where the value of SomeColumn is > 0
df.where(df.SomeColumn>0)#makes all the columns of all the rows NaN where SomeColumn is <= 0
#you can drop the above NaN data using the dropna() function
df.where(df.SomeColumn>0).dropna()#dropna has many parameters to set depending on what you want to drop
df.where(df.SomeColumn>0,10)#makes all the columns of all the rows 10 where SomeColumn is <= 0
df.where(df.SomeColumn>0)['SomeColumn'].count()#counts the non-null values in SomeColumn
df[(df['col1']>5) & (df['col2']>5)]#you can use multiple conditions with & or |; just remember to put parentheses around each condition
#appending data / rows to a dataframe
df = pd.DataFrame([[1, 2],[3, 4]],columns=list('AB'))
df2 = pd.DataFrame([[5, 6],[7, 8]],columns=list('AB'))
df = df.append(df2,ignore_index=True)#ignore_index in case the index is not proper
#concat
pd.concat([df1, df2, df3])#we can concatenate dataframes
#the above append example using concat
df = pd.concat([df, df2],ignore_index=True)
#** append is a special case of concat
#remove duplicate values
df.drop_duplicates(inplace=True)
#get the null count of each column
df.apply(lambda x: sum(x.isnull()))
#create a 2d numpy array from a dataframe
df.values
#value counts
df['SomeColumn'].value_counts()#gives the count of each distinct value in SomeColumn
df.groupby('SomeColumn').size()#gives the same result as above, with sorted index
df.Gold.value_counts().sort_index()#gives the same result as above
#sorting values (sorting the index we have seen above)
df.sort_values(by=['col1'])#sort by col1
df.sort_values(by=['col1','col2'])#sort by col1, then within col1 sort by col2
#setting the value of other columns after filtering on some column
df.loc[df.AAA >= 5,'BBB'] = -1#sets the value of column BBB to -1 where column AAA >= 5
df.loc[df.AAA >= 5,['BBB','CCC']] = 555#similarly, sets the values of two columns based on another column
#the above code can also be used for filtering
df.loc[df.AAA >= 5,'BBB']#gives the values of column BBB where column AAA >= 5
#masking using another dataframe
df_mask = pd.DataFrame({'AAA':[True]*4,'BBB':[False]*4,'CCC':[True,False]*2})
df.where(df_mask,-1000)#using the above mask we set the values of df to -1000 where the mask is False
#(where sets the value to NaN where the mask is False, or replaces it with the parameter value)
#changing the index
df.reset_index()#creates one more column called index holding the old index values, and the index becomes 0,1,2,3...; use inplace=True to keep this change, or do df = df.reset_index()
df.set_index('col1')#sets col1 as the index; assign the result or use inplace=True as above to keep the changes
df['indexcol'] = df.index#in case you are making index changes and want to go back to the old index, you can keep the indexes in a column
#you can also have multilevel indexing
#get unique values
df.col1.unique()#returns the unique values of col1
df.col1.nunique()#returns the number of unique values of col1, similar to len(df.col1.unique())
#join or merge
#outer join or union
print(pd.merge(staff_df, student_df, how='outer', left_index=True, right_index=True))
#inner join or intersection
print(pd.merge(staff_df, student_df, how='inner', left_index=True, right_index=True))
#usually if we want to join on the index as above, we make the primary key column (the column we want to join on) the index, or we can do as below
#the same way we can do how='left' or how='right'
#if you want to join on columns instead of right_index and left_index,
#you write right_on and left_on with the column name, like right_on='name'
#we should always have one of left_index or left_on, and similarly one of right_index or right_on
#also if both dataframes have a common column named location
#and we join on name, then the output will have location_x and location_y,
#x for the left df and y for the right df, see the sketch below
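#a hedged sketch: staff_df and student_df are not defined above, so here is one made-up version of them,
#joined on a column to show left_on/right_on and the _x/_y suffixes
staff_df = pd.DataFrame({'name':['kelly','sally','james'],'location':['office','lab','dorm']})
student_df = pd.DataFrame({'name':['james','mike','sally'],'location':['home','hostel','flat']})
merged = pd.merge(staff_df,student_df,how='inner',left_on='name',right_on='name')
print(merged)#the common column location shows up as location_x (left df) and location_y (right df)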
#method chaining
df.where(df['col4'] == 50).dropna().set_index(['col0','col1']).rename(columns={'col2':'col3'}).head()
#mapping
mapp = {True:'TT',False:'FF'}#create a dict
df['NewTestCol'] = df['NewTestCol'].map(mapp)#replaces False with FF and True with TT in column NewTestCol
#apply
def someFunc(row):
    data = row['col2']+row['col3']#col2 and col3 are string columns, so we are concatenating them
    return data
df.apply(someFunc,axis=1).head()#applying someFunc to the dataframe row by row
#another use is to count the nulls
df.apply(lambda x: sum(x.isnull()))#by default axis=0, which means it counts the nulls of each column
#string operations
#like .dt for datetime things, we have .str for string operations; again, these work on series
s = pd.Series(['A','B','C','Aaba','Baca',np.nan,'CABA','dog','cat'])
s.str.lower()#all the capitals become lowercase and NaN's stay NaN's
#group by
df.groupby('YEAR')['someNumericColumn'].agg({'sum':np.sum,'avg':np.average})#creates two columns, one sum and one avg, of someNumericColumn grouped by YEAR
df.groupby('YEAR')['someNumericColumn'].sum()#simple group by then sum
#you can also write your own functions or use a lambda function with apply, like below
df1.groupby('index')['item'].apply(lambda x: '-'.join(x))#here we concatenate the values of column item after grouping on column index
#you can also simply group by a column, which creates a grouped object you can iterate through, like below
for group,frame in df.groupby(fun):
    print('there are',str(len(frame)),'records in the group',str(group))
#datatype conversion
#similar to np arrays, here we can also use astype to convert to a different datatype
#we can also use astype to convert to an ordered categorical datatype, see the sketch below
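#a small sketch of converting to an ordered categorical dtype (the values and category names are made up)
from pandas.api.types import CategoricalDtype
grades = pd.Series(['ok','good','not ok','good'])
cat_type = CategoricalDtype(categories=['not ok','ok','good'],ordered=True)
grades = grades.astype(cat_type)
print(grades > 'ok')#the ordering makes comparisons possible: True only for 'good'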
#bin or cut
pd.cut(np.array([1, 7, 5, 4, 6, 3]),3)#cuts the range 1 to 7 into three equal-width parts and assigns the values to those bins
pd.qcut(np.array([1, 7, 5, 4, 6, 3]),3)#cuts into 3 regions where each cut has an equal number of elements, 6/3 = 2 elements each
#you can also define your own bins
bins = [0, 1, 5, 10, 25, 50, 100]#here the bins will be (0, 1] < (1, 5] < (5, 10] < (10, 25] < (25, 50] < (50, 100]
pd.cut(np.array([46.50,44.20,100.00,42.12]),bins)
#you can also label your bins
bins = [0, 1, 5, 10, 25, 50, 100]
labels = ['not ok','ok','fine','good','very good','excellent']#we have 7 edges, which means 6 bins
pd.cut(np.array([46.50,44.20,100.00,42.12]),bins=bins,labels=labels)
#diff
df.diff()#subtracts elements row-wise; for column-wise use axis=1; you can also give periods
#pivot table
df.pivot_table(values='(kW)',index='YEAR',columns='Make',aggfunc=np.mean)#self explanatory
#similarly we can have multiple aggregate functions, like aggfunc=[np.mean,np.min]
#crosstab: here the data can be any array or series, it does not have to be a dataframe
pd.crosstab(rows,columns,rownames=['someName'],colnames=['someName'])
#example: pass numpy arrays or series, not lists
a = np.array([1,2,3])
b = np.array([3,4,5])
pd.crosstab(a,b)
#pandas datetime
#convert to datetime
d1 = ['2 june 2016','Aug 29, 2014','2015-06-26','7/12/16']
t3 = pd.DataFrame(np.random.randint(10,100,(4,2)),index=d1,columns=list('ab'))
t3.index = pd.to_datetime(t3.index)#using pd.to_datetime to convert the index to datetime
#adding and subtracting times
pd.Timestamp('2016-09-02')-pd.Timestamp('2016-09-04')
pd.Timestamp('2016-09-02')+pd.Timedelta('12D 3H')
pd.Timestamp('2016-09-02')+pd.DateOffset(minutes=60)
t = pd.Timestamp('2017-08-10 10:15:00.0 pm')
#now you can use all sorts of attributes and functions on t, like
t.quarter#gives the quarter
t.weekday()#gives the weekday number
t.weekday_name#gives sunday, monday etc.
#creating a date range
pd.date_range('12-APR-2019',periods=9,freq='2W-SUN')
#this creates a date range of 9 dates with a 2-week gap, starting from the first sunday on or after 12-Apr-19
#e.g. the 14th is the first sunday after the 12th, so the first date is the 14th, the second date is 2 weeks later, i.e. the 28th, and so on
pd.date_range('2017 Jul 15 10:15',periods=5,freq='-1D')#reverse time range
#you can also give a start and end datetime and a time zone
pd.date_range(start='2017 Jul 15 10:15',end='2017 Jul 18 10:15',freq='8H',tz='Asia/Hong_Kong')
#for only dates with no time we use normalize
pd.date_range(start='2017 Jul 15 10:15',end='2017 Jul 18 10:15',freq='D',normalize=True)
#period
p = pd.Period('8/2017')#this means the full august of 2017
#if you want to check whether a date falls in a certain period, you can do as below
p.start_time < pd.Timestamp('Aug-11-2017') < p.end_time#this returns True
#similar to date_range we have period_range
pd.period_range('2017-02-01 10:15',freq='H',periods=10)
#convert and use timezones
t = someTimestamp.tz_localize(tz='US/Central')#here we declare that the timezone of someTimestamp is US/Central
t.tz_convert('Asia/Tokyo')#here we convert it to the Asia/Tokyo timezone
#to see the available timezones
from pytz import common_timezones, all_timezones
[tz for tz in all_timezones]
#resample
dates = pd.date_range('12-APR-2019',periods=9,freq='2W-SUN')
df = pd.DataFrame({'Count1':100 + np.random.randint(-5,10,9).cumsum(),
                   'Count2':100 + np.random.randint(-5,10,9)},index=dates)#creating a dataframe with a datetime index
df.resample('M').mean()#this groups the data by month and then takes the mean
#rolling
#you can create a rolling sum or mean using some window
roll = df.rolling(window=20)#instead of the index, you can give the column name to roll on using on=
roll.mean()#you can also do both operations in one chaining
#similar to rolling we have expanding, where the window grows to include all rows seen so far, see the sketch below
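#a hedged sketch of expanding vs rolling, reusing the df with Count1 from the resample example above
print(df['Count1'].rolling(window=3).mean())#mean over a fixed trailing window of 3 rows
print(df['Count1'].expanding(min_periods=1).mean())#mean over all rows seen so far; the window keeps growing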
#changing frequency
df.asfreq('W',method='ffill')#this updates the index of the previous dataframe from biweekly to weekly; to populate the values for
#the new indices we use forward fill (ffill), which just copies the previous value to each newly created index
#for getting the year, month etc. from a date we use the .dt accessor; ** .dt works on series
df['year'] = df['dateTimeColumn'].dt.year#this creates a year column; .dt.year extracts the year from dateTimeColumn
#null and not null
df['columns'].isnull()
df['columns'].notnull()
#you can use this like so
df[df['col1'].isnull()]#returns a dataframe with the rows where col1 is null
#null filling time series data using interpolate
df.interpolate(inplace=True,axis=0)
#** we can also use read_csv and read_table to read files from the internet
#groupby
#whenever you group by, the grouped label usually becomes the index; to avoid that we use
df.groupby(['colToGroup'],as_index=False)#as_index=False means don't use this as the index, create a separate index
#to get all the numeric columns
numCols = data.describe().columns#describe only outputs numeric columns; to get the object columns we can use data.describe(include=['O'])
#to group data by month name using a date field
data.groupby(data['Date field'].dt.strftime('%B')).size()
#dummy variables
data = pd.get_dummies(data,columns=columns_to_get_dummies_of)
data.columns#here you will see all the columns + the columns created by get_dummies
#make dummies
data = pd.get_dummies(data,columns=['columnname'])
dummies = pd.get_dummies(data['columnname'],prefix='anything')
#use columns
usecols=np.arange(1,21)#we can give something like this to pd.read_csv
usecols="A:C,D,F,G:I"#in read_excel you can also select columns using the column letters A, B etc.
#some common functions
df.shape#gives the shape, rows * columns
df.info()#gives the column names, how many non-null values, and the data types
df.describe()#gives the numerical columns and their counts, max, min, quantiles, mean, std
#for getting the details of non-numeric columns
df.describe(include=['O'])#means include object type
df.describe(include=[object])#same as above
#loading huge data using an iterator object: say we have some millions of records, we can't load them all at once if our system is not capable;
#also if we only want certain data, loading the full data and then filtering is an overhead, so a better solution:
data_temp = pd.read_csv(dataPath,iterator=True,chunksize=1000,usecols=['col1','col2','col3'])
data_main = pd.concat([chunk[chunk.col1==2012] for chunk in data_temp])
#what we do above is first load the data using an iterator, 1000 rows and a few columns at a time, and then concatenate the chunks;
#also we only keep the rows where col1 == 2012
#writing to csv
output_df = pd.DataFrame({'PassengerId':passengerId,'Survived':prediction})
output_df.to_csv(r'C:\Users\avishek\Downloads\titanicData\RandomForest.csv',header=True,sep=',',index=False)
#read excel
pd.read_excel(r'C:\Users\avishek\Downloads\housing.xlsx',sheetname='Sheet1',header=None)#here we can also give the sheet name
#concatenating multiple files into one
import os
filelist = os.listdir(r'C:\Users\avishek\Downloads\Datasets')
#print(filelist)
df_list = [pd.read_csv(r'C:\Users\avishek\Downloads\Datasets\\'+file,header=None) for file in filelist]
df = pd.concat(df_list,axis=0)
#the same thing as above using glob; glob is used for unix-style pathname pattern expansion
from glob import glob
path = r'C:\Users\avishek\Downloads\Datasets'
filelist = glob(path + '/*.csv')
print(filelist)
df_list = [pd.read_csv(f,header=None) for f in filelist]
