{ "cells": [ { "attachments": {}, "cell_type": "markdown", "id": "843a017a", "metadata": {}, "source": [ "Cleaning and Prepping Data" ] }, { "cell_type": "code", "execution_count": 1, "id": "ba6cd5c4", "metadata": { "scrolled": true }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Number of CowsNumber of BuffaloesLocation of FarmAverage Daily Milk Production (litres)Milk Collection CentreYearly Expenditure on Animal Health (INR)Yeary Income from Selling Manure (INR)Primary Feed for LivestockSatisfaction with Government SupportMonthly Operating Costs (INR)Monthly Revenue (INR)Use of AutomationNumber of Family Members/Employees Working at the Farm
017211jamnagar1075mother dairy5064243224company products7.09266476967.0no38
14723vadodara350dudhsagar dairy997407011company products3.04092942516.0no39
2117187rajkot1520aavin951107011company products9.08268964924.0yes12
3192130jamnagar1610selling privately to consumers2488140605company products1.019199388905.0yes44
432398ahmedabad2360verka5403819521natural plants5.04701882671.0yes25
\n", "
" ], "text/plain": [ " Number of Cows Number of Buffaloes Location of Farm \\\n", "0 172 11 jamnagar \n", "1 47 23 vadodara \n", "2 117 187 rajkot \n", "3 192 130 jamnagar \n", "4 323 98 ahmedabad \n", "\n", " Average Daily Milk Production (litres) Milk Collection Centre \\\n", "0 1075 mother dairy \n", "1 350 dudhsagar dairy \n", "2 1520 aavin \n", "3 1610 selling privately to consumers \n", "4 2360 verka \n", "\n", " Yearly Expenditure on Animal Health (INR) \\\n", "0 50642 \n", "1 99740 \n", "2 95110 \n", "3 24881 \n", "4 54038 \n", "\n", " Yeary Income from Selling Manure (INR) Primary Feed for Livestock \\\n", "0 43224 company products \n", "1 7011 company products \n", "2 7011 company products \n", "3 40605 company products \n", "4 19521 natural plants \n", "\n", " Satisfaction with Government Support Monthly Operating Costs (INR) \\\n", "0 7.0 92664 \n", "1 3.0 40929 \n", "2 9.0 82689 \n", "3 1.0 191993 \n", "4 5.0 47018 \n", "\n", " Monthly Revenue (INR) Use of Automation \\\n", "0 76967.0 no \n", "1 42516.0 no \n", "2 64924.0 yes \n", "3 88905.0 yes \n", "4 82671.0 yes \n", "\n", " Number of Family Members/Employees Working at the Farm \n", "0 38 \n", "1 39 \n", "2 12 \n", "3 44 \n", "4 25 " ] }, "execution_count": 1, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import pandas as pd\n", "\n", "file_path = '/Users/dhruvtrivedi/Downloads/Final Project Stat 371/Farm_Data_Gujarat.csv'\n", "farm_data = pd.read_csv(file_path)\n", "\n", "# Display the first few rows\n", "farm_data.head()" ] }, { "cell_type": "code", "execution_count": 2, "id": "e7c6aa30", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "vadodara 60\n", "jamnagar 59\n", "surat 55\n", "ahmedabad 52\n", "rajkot 51\n", "Name: Location of Farm, dtype: int64" ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Checking the frequency of each category in 'Location of Farm'\n", "location_counts = farm_data['Location of Farm'].value_counts()\n", "location_counts\n" ] }, { "cell_type": "code", "execution_count": 3, "id": "4a751c20", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Number of CowsNumber of BuffaloesAverage Daily Milk Production (litres)Milk Collection CentreYearly Expenditure on Animal Health (INR)Yeary Income from Selling Manure (INR)Primary Feed for LivestockSatisfaction with Government SupportMonthly Operating Costs (INR)Monthly Revenue (INR)Use of AutomationNumber of Family Members/Employees Working at the Farmahmedabadjamnagarrajkotsurat
0172111075mother dairy5064243224company products7.09266476967.0no380100
14723350dudhsagar dairy997407011company products3.04092942516.0no390000
21171871520aavin951107011company products9.08268964924.0yes120010
31921301610selling privately to consumers2488140605company products1.019199388905.0yes440100
4323982360verka5403819521natural plants5.04701882671.0yes251000
\n", "
" ], "text/plain": [ " Number of Cows Number of Buffaloes \\\n", "0 172 11 \n", "1 47 23 \n", "2 117 187 \n", "3 192 130 \n", "4 323 98 \n", "\n", " Average Daily Milk Production (litres) Milk Collection Centre \\\n", "0 1075 mother dairy \n", "1 350 dudhsagar dairy \n", "2 1520 aavin \n", "3 1610 selling privately to consumers \n", "4 2360 verka \n", "\n", " Yearly Expenditure on Animal Health (INR) \\\n", "0 50642 \n", "1 99740 \n", "2 95110 \n", "3 24881 \n", "4 54038 \n", "\n", " Yeary Income from Selling Manure (INR) Primary Feed for Livestock \\\n", "0 43224 company products \n", "1 7011 company products \n", "2 7011 company products \n", "3 40605 company products \n", "4 19521 natural plants \n", "\n", " Satisfaction with Government Support Monthly Operating Costs (INR) \\\n", "0 7.0 92664 \n", "1 3.0 40929 \n", "2 9.0 82689 \n", "3 1.0 191993 \n", "4 5.0 47018 \n", "\n", " Monthly Revenue (INR) Use of Automation \\\n", "0 76967.0 no \n", "1 42516.0 no \n", "2 64924.0 yes \n", "3 88905.0 yes \n", "4 82671.0 yes \n", "\n", " Number of Family Members/Employees Working at the Farm ahmedabad \\\n", "0 38 0 \n", "1 39 0 \n", "2 12 0 \n", "3 44 0 \n", "4 25 1 \n", "\n", " jamnagar rajkot surat \n", "0 1 0 0 \n", "1 0 0 0 \n", "2 0 1 0 \n", "3 1 0 0 \n", "4 0 0 0 " ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Correcting the dummy variables for 'Location of Farm', with 'Vadodara' as the baseline\n", "location_dummies_corrected = pd.get_dummies(farm_data['Location of Farm']).drop(['vadodara'], axis=1)\n", "\n", "# Removing the original 'Location of Farm' column\n", "transformed_farm_data = farm_data.drop('Location of Farm', axis=1).join(location_dummies_corrected)\n", "transformed_farm_data.head()\n" ] }, { "cell_type": "code", "execution_count": 4, "id": "a02ae613", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Satisfaction_1_4 145\n", "Satisfaction_8_10 77\n", "Satisfaction_5_7 70\n", "Name: satisfaction_with_government_support, dtype: int64" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Categorizing 'Satisfaction with Government Support' into three groups\n", "def categorize_satisfaction(score):\n", " if 1 <= score <= 4:\n", " return 'Satisfaction_1_4'\n", " elif 5 <= score <= 7:\n", " return 'Satisfaction_5_7'\n", " elif 8 <= score <= 10:\n", " return 'Satisfaction_8_10'\n", "\n", "# Applying the categorization\n", "transformed_farm_data['satisfaction_with_government_support'] = farm_data['Satisfaction with Government Support'].apply(categorize_satisfaction)\n", "\n", "# Checking the frequency \n", "satisfaction_counts = transformed_farm_data['satisfaction_with_government_support'].value_counts()\n", "satisfaction_counts\n" ] }, { "cell_type": "code", "execution_count": 5, "id": "85dca4e7", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "kwality limited 36\n", "aavin 33\n", "mother dairy 32\n", "parag milk foods ltd 31\n", "orissa state cooperative milk producers federation 29\n", "amul 28\n", "selling privately to consumers 27\n", "dudhsagar dairy 24\n", "karnataka co-operative milk federation 20\n", "verka 16\n", "dynamix dairy 16\n", "milk collection centre 7\n", "Name: Milk Collection Centre, dtype: int64" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Dropping the original 'Satisfaction with Government Support' variable\n", "transformed_farm_data = transformed_farm_data.drop('Satisfaction with Government Support', axis=1)\n", "\n", "# Proceeding with the 'Milk Collection Centre' variable\n", "# Checking the frequency of each category in 'Milk Collection Centre'\n", "milk_collection_counts = transformed_farm_data['Milk Collection Centre'].value_counts()\n", "milk_collection_counts\n" ] }, { "cell_type": "code", "execution_count": 6, "id": "000b2105", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Number of CowsNumber of BuffaloesAverage Daily Milk Production (litres)Yearly Expenditure on Animal Health (INR)Yeary Income from Selling Manure (INR)Primary Feed for LivestockMonthly Operating Costs (INR)Monthly Revenue (INR)Use of AutomationNumber of Family Members/Employees Working at the Farm...amuldudhsagar dairydynamix dairykarnataka co-operative milk federationmilk collection centremother dairyorissa state cooperative milk producers federationparag milk foods ltdselling privately to consumersverka
01721110755064243224company products9266476967.0no38...0000010000
14723350997407011company products4092942516.0no39...0100000000
21171871520951107011company products8268964924.0yes12...0000000000
319213016102488140605company products19199388905.0yes44...0000000010
43239823605403819521natural plants4701882671.0yes25...0000000001
\n", "

5 rows × 26 columns

\n", "
" ], "text/plain": [ " Number of Cows Number of Buffaloes \\\n", "0 172 11 \n", "1 47 23 \n", "2 117 187 \n", "3 192 130 \n", "4 323 98 \n", "\n", " Average Daily Milk Production (litres) \\\n", "0 1075 \n", "1 350 \n", "2 1520 \n", "3 1610 \n", "4 2360 \n", "\n", " Yearly Expenditure on Animal Health (INR) \\\n", "0 50642 \n", "1 99740 \n", "2 95110 \n", "3 24881 \n", "4 54038 \n", "\n", " Yeary Income from Selling Manure (INR) Primary Feed for Livestock \\\n", "0 43224 company products \n", "1 7011 company products \n", "2 7011 company products \n", "3 40605 company products \n", "4 19521 natural plants \n", "\n", " Monthly Operating Costs (INR) Monthly Revenue (INR) Use of Automation \\\n", "0 92664 76967.0 no \n", "1 40929 42516.0 no \n", "2 82689 64924.0 yes \n", "3 191993 88905.0 yes \n", "4 47018 82671.0 yes \n", "\n", " Number of Family Members/Employees Working at the Farm ... amul \\\n", "0 38 ... 0 \n", "1 39 ... 0 \n", "2 12 ... 0 \n", "3 44 ... 0 \n", "4 25 ... 0 \n", "\n", " dudhsagar dairy dynamix dairy karnataka co-operative milk federation \\\n", "0 0 0 0 \n", "1 1 0 0 \n", "2 0 0 0 \n", "3 0 0 0 \n", "4 0 0 0 \n", "\n", " milk collection centre mother dairy \\\n", "0 0 1 \n", "1 0 0 \n", "2 0 0 \n", "3 0 0 \n", "4 0 0 \n", "\n", " orissa state cooperative milk producers federation parag milk foods ltd \\\n", "0 0 0 \n", "1 0 0 \n", "2 0 0 \n", "3 0 0 \n", "4 0 0 \n", "\n", " selling privately to consumers verka \n", "0 0 0 \n", "1 0 0 \n", "2 0 0 \n", "3 1 0 \n", "4 0 1 \n", "\n", "[5 rows x 26 columns]" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "transformed_farm_data['Milk Collection Centre'] = transformed_farm_data['Milk Collection Centre'].replace('Milk Collection Centre', pd.NA)\n", "\n", "# Correcting the dummy variables for 'Milk Collection Centre', with the most frequent category ('Kwality Limited') as the baseline\n", "milk_collection_dummies = pd.get_dummies(transformed_farm_data['Milk Collection Centre']).drop(['kwality limited'], axis=1)\n", "\n", "# Removing the original 'Milk Collection Centre' variable\n", "transformed_farm_data = transformed_farm_data.drop('Milk Collection Centre', axis=1).join(milk_collection_dummies)\n", "\n", "# Displaying the first few rows to verify the changes\n", "transformed_farm_data.head()\n" ] }, { "cell_type": "code", "execution_count": 7, "id": "01b918c4", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(company products 142\n", " natural plants 131\n", " Name: Primary Feed for Livestock, dtype: int64,\n", " no 160\n", " yes 113\n", " Name: Use of Automation, dtype: int64)" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Checking the frequency \n", "primary_feed_counts = transformed_farm_data['Primary Feed for Livestock'].value_counts()\n", "\n", "# Checking the frequency\n", "automation_counts = transformed_farm_data['Use of Automation'].value_counts()\n", "\n", "primary_feed_counts, automation_counts\n" ] }, { "cell_type": "code", "execution_count": 8, "id": "fda822b5", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Number of CowsNumber of BuffaloesAverage Daily Milk Production (litres)Yearly Expenditure on Animal Health (INR)Yeary Income from Selling Manure (INR)Monthly Operating Costs (INR)Monthly Revenue (INR)Number of Family Members/Employees Working at the Farmahmedabadjamnagar...dynamix dairykarnataka co-operative milk federationmilk collection centremother dairyorissa state cooperative milk producers federationparag milk foods ltdselling privately to consumersverkanatural plantsyes
017211107550642432249266476967.03801...0001000000
147233509974070114092942516.03900...0000000000
211718715209511070118268964924.01200...0000000001
31921301610248814060519199388905.04401...0000001001
432398236054038195214701882671.02510...0000000111
\n", "

5 rows × 26 columns

\n", "
" ], "text/plain": [ " Number of Cows Number of Buffaloes \\\n", "0 172 11 \n", "1 47 23 \n", "2 117 187 \n", "3 192 130 \n", "4 323 98 \n", "\n", " Average Daily Milk Production (litres) \\\n", "0 1075 \n", "1 350 \n", "2 1520 \n", "3 1610 \n", "4 2360 \n", "\n", " Yearly Expenditure on Animal Health (INR) \\\n", "0 50642 \n", "1 99740 \n", "2 95110 \n", "3 24881 \n", "4 54038 \n", "\n", " Yeary Income from Selling Manure (INR) Monthly Operating Costs (INR) \\\n", "0 43224 92664 \n", "1 7011 40929 \n", "2 7011 82689 \n", "3 40605 191993 \n", "4 19521 47018 \n", "\n", " Monthly Revenue (INR) \\\n", "0 76967.0 \n", "1 42516.0 \n", "2 64924.0 \n", "3 88905.0 \n", "4 82671.0 \n", "\n", " Number of Family Members/Employees Working at the Farm ahmedabad \\\n", "0 38 0 \n", "1 39 0 \n", "2 12 0 \n", "3 44 0 \n", "4 25 1 \n", "\n", " jamnagar ... dynamix dairy karnataka co-operative milk federation \\\n", "0 1 ... 0 0 \n", "1 0 ... 0 0 \n", "2 0 ... 0 0 \n", "3 1 ... 0 0 \n", "4 0 ... 0 0 \n", "\n", " milk collection centre mother dairy \\\n", "0 0 1 \n", "1 0 0 \n", "2 0 0 \n", "3 0 0 \n", "4 0 0 \n", "\n", " orissa state cooperative milk producers federation parag milk foods ltd \\\n", "0 0 0 \n", "1 0 0 \n", "2 0 0 \n", "3 0 0 \n", "4 0 0 \n", "\n", " selling privately to consumers verka natural plants yes \n", "0 0 0 0 0 \n", "1 0 0 0 0 \n", "2 0 0 0 1 \n", "3 1 0 0 1 \n", "4 0 1 1 1 \n", "\n", "[5 rows x 26 columns]" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# 'Company Products' as the baseline\n", "primary_feed_dummies = pd.get_dummies(transformed_farm_data['Primary Feed for Livestock']).drop(['company products'], axis=1)\n", "\n", "# 'No' as the baseline\n", "automation_dummies = pd.get_dummies(transformed_farm_data['Use of Automation']).drop(['no'], axis=1)\n", "\n", "# Removing the original 'Primary Feed for Livestock' and 'Use of Automation' variables\n", "transformed_farm_data = transformed_farm_data.drop(['Primary Feed for Livestock', 'Use of Automation'], axis=1)\n", "\n", "# Adding the new dummy variables\n", "transformed_farm_data = transformed_farm_data.join(primary_feed_dummies).join(automation_dummies)\n", "\n", "# Display\n", "transformed_farm_data.head()\n" ] }, { "cell_type": "code", "execution_count": 9, "id": "80561131", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(299, 7)" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Total number of rows in the dataset\n", "total_rows = transformed_farm_data.shape[0]\n", "\n", "# Counting the number of rows with NA entries\n", "rows_with_na = transformed_farm_data.isna().any(axis=1).sum()\n", "\n", "total_rows, rows_with_na\n" ] }, { "cell_type": "code", "execution_count": 10, "id": "e00c9f9e", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "292" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Removing rows with NA entries\n", "transformed_farm_data_cleaned = transformed_farm_data.dropna()\n", "\n", "# Counting the total number of rows after removing NA entries\n", "total_rows_after_removal = transformed_farm_data_cleaned.shape[0]\n", "\n", "total_rows_after_removal\n" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.9.7" } }, "nbformat": 4, "nbformat_minor": 5 }