{
"cells": [
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"# Cleaning scripts"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"years = list(range(1950, 2100+1, 5))"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/var/folders/cv/xn1my8q111q1ngz5g62wkt_r0000gn/T/ipykernel_1127/1698792024.py:1: DtypeWarning: Columns (2,3,4) have mixed types. Specify dtype option on import or set low_memory=False.\n",
" popshare = pd.read_csv('raw/WPP2022_PopulationByAge5GroupSex_Percentage_Medium.csv')\n"
]
}
],
"source": [
"popshare = pd.read_csv('raw/WPP2022_PopulationByAge5GroupSex_Percentage_Medium.csv')\n",
"popshare = popshare.loc[popshare['Time'].isin(years)].reset_index(drop=True)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/var/folders/cv/xn1my8q111q1ngz5g62wkt_r0000gn/T/ipykernel_1127/1828745933.py:1: DtypeWarning: Columns (2,3,4) have mixed types. Specify dtype option on import or set low_memory=False.\n",
" pop = pd.read_csv('raw/WPP2022_PopulationByAge5GroupSex_Medium.csv')\n"
]
}
],
"source": [
"pop = pd.read_csv('raw/WPP2022_PopulationByAge5GroupSex_Medium.csv')\n",
"pop = pop.loc[pop['Time'].isin(years)].reset_index(drop=True)"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/var/folders/cv/xn1my8q111q1ngz5g62wkt_r0000gn/T/ipykernel_1127/1733417568.py:1: DtypeWarning: Columns (2) have mixed types. Specify dtype option on import or set low_memory=False.\n",
" median_age = pd.read_csv('raw/WPP2022_Demographic_Indicators_Medium.csv')\n"
]
}
],
"source": [
"median_age = pd.read_csv('raw/WPP2022_Demographic_Indicators_Medium.csv')\n",
"median_age = median_age.loc[median_age['Time'].isin(years)].reset_index(drop=True)\n",
"median_age = median_age[(median_age['LocTypeID'] == 1) | (median_age['LocTypeID'] == 4)]\n",
"median_age = median_age[['LocID', 'Location', 'Time', 'MedianAgePop']]"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"median_age.to_csv('median_age.csv', index=False)"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"## Wrangle into relational database"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"age_groups = popshare[['AgeGrp']]\n",
"age_groups = age_groups.drop_duplicates(ignore_index=True)\n",
"age_groups.to_csv('age_groups.csv', index=False)"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"
\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" AgeGrp | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 0-4 | \n",
"
\n",
" \n",
" 1 | \n",
" 5-9 | \n",
"
\n",
" \n",
" 2 | \n",
" 10-14 | \n",
"
\n",
" \n",
" 3 | \n",
" 15-19 | \n",
"
\n",
" \n",
" 4 | \n",
" 20-24 | \n",
"
\n",
" \n",
" 5 | \n",
" 25-29 | \n",
"
\n",
" \n",
" 6 | \n",
" 30-34 | \n",
"
\n",
" \n",
" 7 | \n",
" 35-39 | \n",
"
\n",
" \n",
" 8 | \n",
" 40-44 | \n",
"
\n",
" \n",
" 9 | \n",
" 45-49 | \n",
"
\n",
" \n",
" 10 | \n",
" 50-54 | \n",
"
\n",
" \n",
" 11 | \n",
" 55-59 | \n",
"
\n",
" \n",
" 12 | \n",
" 60-64 | \n",
"
\n",
" \n",
" 13 | \n",
" 65-69 | \n",
"
\n",
" \n",
" 14 | \n",
" 70-74 | \n",
"
\n",
" \n",
" 15 | \n",
" 75-79 | \n",
"
\n",
" \n",
" 16 | \n",
" 80-84 | \n",
"
\n",
" \n",
" 17 | \n",
" 85-89 | \n",
"
\n",
" \n",
" 18 | \n",
" 90-94 | \n",
"
\n",
" \n",
" 19 | \n",
" 95-99 | \n",
"
\n",
" \n",
" 20 | \n",
" 100+ | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" AgeGrp\n",
"0 0-4\n",
"1 5-9\n",
"2 10-14\n",
"3 15-19\n",
"4 20-24\n",
"5 25-29\n",
"6 30-34\n",
"7 35-39\n",
"8 40-44\n",
"9 45-49\n",
"10 50-54\n",
"11 55-59\n",
"12 60-64\n",
"13 65-69\n",
"14 70-74\n",
"15 75-79\n",
"16 80-84\n",
"17 85-89\n",
"18 90-94\n",
"19 95-99\n",
"20 100+"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"age_groups"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" LocID | \n",
" ParentID | \n",
" Location | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 900 | \n",
" 0 | \n",
" World | \n",
"
\n",
" \n",
" 1 | \n",
" 108 | \n",
" 910 | \n",
" Burundi | \n",
"
\n",
" \n",
" 2 | \n",
" 174 | \n",
" 910 | \n",
" Comoros | \n",
"
\n",
" \n",
" 3 | \n",
" 262 | \n",
" 910 | \n",
" Djibouti | \n",
"
\n",
" \n",
" 4 | \n",
" 232 | \n",
" 910 | \n",
" Eritrea | \n",
"
\n",
" \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
"
\n",
" \n",
" 232 | \n",
" 882 | \n",
" 957 | \n",
" Samoa | \n",
"
\n",
" \n",
" 233 | \n",
" 772 | \n",
" 957 | \n",
" Tokelau | \n",
"
\n",
" \n",
" 234 | \n",
" 776 | \n",
" 957 | \n",
" Tonga | \n",
"
\n",
" \n",
" 235 | \n",
" 798 | \n",
" 957 | \n",
" Tuvalu | \n",
"
\n",
" \n",
" 236 | \n",
" 876 | \n",
" 957 | \n",
" Wallis and Futuna Islands | \n",
"
\n",
" \n",
"
\n",
"
237 rows × 3 columns
\n",
"
"
],
"text/plain": [
" LocID ParentID Location\n",
"0 900 0 World\n",
"1 108 910 Burundi\n",
"2 174 910 Comoros\n",
"3 262 910 Djibouti\n",
"4 232 910 Eritrea\n",
".. ... ... ...\n",
"232 882 957 Samoa\n",
"233 772 957 Tokelau\n",
"234 776 957 Tonga\n",
"235 798 957 Tuvalu\n",
"236 876 957 Wallis and Futuna Islands\n",
"\n",
"[237 rows x 3 columns]"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"countries = popshare[(popshare['LocTypeID'] == 1) | (popshare['LocTypeID'] == 4)]\n",
"countries = countries[['LocID', 'ParentID', 'Location']]\n",
"countries = countries.drop_duplicates(subset=['LocID'], ignore_index=True)\n",
"countries = countries.dropna(ignore_index=True)\n",
"countries"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
"countries.to_csv('countries.csv', index=False)"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" LocID | \n",
" Location | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 903 | \n",
" Africa | \n",
"
\n",
" \n",
" 1 | \n",
" 935 | \n",
" Asia | \n",
"
\n",
" \n",
" 2 | \n",
" 908 | \n",
" Europe | \n",
"
\n",
" \n",
" 3 | \n",
" 904 | \n",
" Latin America and the Caribbean | \n",
"
\n",
" \n",
" 4 | \n",
" 905 | \n",
" Northern America | \n",
"
\n",
" \n",
" 5 | \n",
" 909 | \n",
" Oceania | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" LocID Location\n",
"0 903 Africa\n",
"1 935 Asia\n",
"2 908 Europe\n",
"3 904 Latin America and the Caribbean\n",
"4 905 Northern America\n",
"5 909 Oceania"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"continents = popshare[popshare['LocTypeID'] == 2]\n",
"continents = continents[['LocID', 'Location']]\n",
"continents = continents.drop_duplicates(subset=['LocID'], ignore_index=True)\n",
"continents = continents.dropna(ignore_index=True)\n",
"continents"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" LocID | \n",
" ParentID | \n",
" Location | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 910 | \n",
" 903 | \n",
" Eastern Africa | \n",
"
\n",
" \n",
" 1 | \n",
" 911 | \n",
" 903 | \n",
" Middle Africa | \n",
"
\n",
" \n",
" 2 | \n",
" 912 | \n",
" 903 | \n",
" Northern Africa | \n",
"
\n",
" \n",
" 3 | \n",
" 913 | \n",
" 903 | \n",
" Southern Africa | \n",
"
\n",
" \n",
" 4 | \n",
" 914 | \n",
" 903 | \n",
" Western Africa | \n",
"
\n",
" \n",
" 5 | \n",
" 5500 | \n",
" 935 | \n",
" Central Asia | \n",
"
\n",
" \n",
" 6 | \n",
" 906 | \n",
" 935 | \n",
" Eastern Asia | \n",
"
\n",
" \n",
" 7 | \n",
" 5501 | \n",
" 935 | \n",
" Southern Asia | \n",
"
\n",
" \n",
" 8 | \n",
" 920 | \n",
" 935 | \n",
" South-Eastern Asia | \n",
"
\n",
" \n",
" 9 | \n",
" 922 | \n",
" 935 | \n",
" Western Asia | \n",
"
\n",
" \n",
" 10 | \n",
" 923 | \n",
" 908 | \n",
" Eastern Europe | \n",
"
\n",
" \n",
" 11 | \n",
" 924 | \n",
" 908 | \n",
" Northern Europe | \n",
"
\n",
" \n",
" 12 | \n",
" 925 | \n",
" 908 | \n",
" Southern Europe | \n",
"
\n",
" \n",
" 13 | \n",
" 926 | \n",
" 908 | \n",
" Western Europe | \n",
"
\n",
" \n",
" 14 | \n",
" 915 | \n",
" 904 | \n",
" Caribbean | \n",
"
\n",
" \n",
" 15 | \n",
" 916 | \n",
" 904 | \n",
" Central America | \n",
"
\n",
" \n",
" 16 | \n",
" 931 | \n",
" 904 | \n",
" South America | \n",
"
\n",
" \n",
" 17 | \n",
" 927 | \n",
" 909 | \n",
" Australia/New Zealand | \n",
"
\n",
" \n",
" 18 | \n",
" 928 | \n",
" 909 | \n",
" Melanesia | \n",
"
\n",
" \n",
" 19 | \n",
" 954 | \n",
" 909 | \n",
" Micronesia | \n",
"
\n",
" \n",
" 20 | \n",
" 957 | \n",
" 909 | \n",
" Polynesia | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" LocID ParentID Location\n",
"0 910 903 Eastern Africa\n",
"1 911 903 Middle Africa\n",
"2 912 903 Northern Africa\n",
"3 913 903 Southern Africa\n",
"4 914 903 Western Africa\n",
"5 5500 935 Central Asia\n",
"6 906 935 Eastern Asia\n",
"7 5501 935 Southern Asia\n",
"8 920 935 South-Eastern Asia\n",
"9 922 935 Western Asia\n",
"10 923 908 Eastern Europe\n",
"11 924 908 Northern Europe\n",
"12 925 908 Southern Europe\n",
"13 926 908 Western Europe\n",
"14 915 904 Caribbean\n",
"15 916 904 Central America\n",
"16 931 904 South America\n",
"17 927 909 Australia/New Zealand\n",
"18 928 909 Melanesia\n",
"19 954 909 Micronesia\n",
"20 957 909 Polynesia"
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"regions = popshare[popshare['LocTypeID'] == 3]\n",
"regions = regions[['LocID', 'ParentID', 'Location']]\n",
"regions = regions.drop_duplicates(subset=['LocID'], ignore_index=True)\n",
"regions = regions.dropna(ignore_index=True)\n",
"regions"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"## Consolidate and export"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [],
"source": [
"pop_world = pop[pop['Location'] == 'World']\n",
"pop_world = pop_world[\n",
" ['LocID', 'Time', 'AgeGrp', 'PopMale', 'PopFemale']\n",
"].reset_index(drop=True)\n",
"\n",
"popshare_world = popshare[popshare['Location'] == 'World']\n",
"popshare_world = popshare_world[\n",
" ['LocID', 'Location', 'ParentID', 'Time', 'AgeGrp', 'PopMale', 'PopFemale']\n",
"].reset_index(drop=True)\n",
"popshare_world = popshare_world.rename(columns={'PopMale': 'PopShareMale', 'PopFemale': 'PopShareFemale'})"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [],
"source": [
"pop_countries = pop[pop['LocTypeID'] == 4]\n",
"pop_countries = pop_countries[\n",
" ['LocID', 'Time', 'AgeGrp', 'PopMale', 'PopFemale']\n",
"].reset_index(drop=True)\n",
"\n",
"popshare_countries = popshare[popshare['LocTypeID'] == 4]\n",
"popshare_countries = popshare_countries[\n",
" ['LocID', 'Location', 'ParentID', 'Time', 'AgeGrp', 'PopMale', 'PopFemale']\n",
"].reset_index(drop=True)\n",
"popshare_countries = popshare_countries.rename(columns={'PopMale': 'PopShareMale', 'PopFemale': 'PopShareFemale'})"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [],
"source": [
"pop_countries = pd.concat([pop_world, pop_countries], ignore_index=True)\n",
"popshare_countries = pd.concat([popshare_world, popshare_countries], ignore_index=True)"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" LocID | \n",
" Location | \n",
" ParentID | \n",
" Time | \n",
" AgeGrp | \n",
" PopShareMale | \n",
" PopShareFemale | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 900 | \n",
" World | \n",
" 0 | \n",
" 1950 | \n",
" 0-4 | \n",
" 6.988 | \n",
" 6.691 | \n",
"
\n",
" \n",
" 1 | \n",
" 900 | \n",
" World | \n",
" 0 | \n",
" 1950 | \n",
" 5-9 | \n",
" 5.478 | \n",
" 5.234 | \n",
"
\n",
" \n",
" 2 | \n",
" 900 | \n",
" World | \n",
" 0 | \n",
" 1950 | \n",
" 10-14 | \n",
" 5.286 | \n",
" 5.040 | \n",
"
\n",
" \n",
" 3 | \n",
" 900 | \n",
" World | \n",
" 0 | \n",
" 1950 | \n",
" 15-19 | \n",
" 4.822 | \n",
" 4.668 | \n",
"
\n",
" \n",
" 4 | \n",
" 900 | \n",
" World | \n",
" 0 | \n",
" 1950 | \n",
" 20-24 | \n",
" 4.442 | \n",
" 4.365 | \n",
"
\n",
" \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
"
\n",
" \n",
" 154282 | \n",
" 876 | \n",
" Wallis and Futuna Islands | \n",
" 957 | \n",
" 2100 | \n",
" 80-84 | \n",
" 2.628 | \n",
" 2.689 | \n",
"
\n",
" \n",
" 154283 | \n",
" 876 | \n",
" Wallis and Futuna Islands | \n",
" 957 | \n",
" 2100 | \n",
" 85-89 | \n",
" 2.426 | \n",
" 2.729 | \n",
"
\n",
" \n",
" 154284 | \n",
" 876 | \n",
" Wallis and Futuna Islands | \n",
" 957 | \n",
" 2100 | \n",
" 90-94 | \n",
" 1.728 | \n",
" 2.012 | \n",
"
\n",
" \n",
" 154285 | \n",
" 876 | \n",
" Wallis and Futuna Islands | \n",
" 957 | \n",
" 2100 | \n",
" 95-99 | \n",
" 0.728 | \n",
" 1.071 | \n",
"
\n",
" \n",
" 154286 | \n",
" 876 | \n",
" Wallis and Futuna Islands | \n",
" 957 | \n",
" 2100 | \n",
" 100+ | \n",
" 0.152 | \n",
" 0.354 | \n",
"
\n",
" \n",
"
\n",
"
154287 rows × 7 columns
\n",
"
"
],
"text/plain": [
" LocID Location ParentID Time AgeGrp PopShareMale \n",
"0 900 World 0 1950 0-4 6.988 \\\n",
"1 900 World 0 1950 5-9 5.478 \n",
"2 900 World 0 1950 10-14 5.286 \n",
"3 900 World 0 1950 15-19 4.822 \n",
"4 900 World 0 1950 20-24 4.442 \n",
"... ... ... ... ... ... ... \n",
"154282 876 Wallis and Futuna Islands 957 2100 80-84 2.628 \n",
"154283 876 Wallis and Futuna Islands 957 2100 85-89 2.426 \n",
"154284 876 Wallis and Futuna Islands 957 2100 90-94 1.728 \n",
"154285 876 Wallis and Futuna Islands 957 2100 95-99 0.728 \n",
"154286 876 Wallis and Futuna Islands 957 2100 100+ 0.152 \n",
"\n",
" PopShareFemale \n",
"0 6.691 \n",
"1 5.234 \n",
"2 5.040 \n",
"3 4.668 \n",
"4 4.365 \n",
"... ... \n",
"154282 2.689 \n",
"154283 2.729 \n",
"154284 2.012 \n",
"154285 1.071 \n",
"154286 0.354 \n",
"\n",
"[154287 rows x 7 columns]"
]
},
"execution_count": 16,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"popshare_countries"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [],
"source": [
"pyramid = pd.merge(\n",
" popshare_countries, \n",
" pop_countries, \n",
" on=['LocID', 'Time', 'AgeGrp'], \n",
" how='left'\n",
").reset_index(drop=True)"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"Add columns to show Male/Female surplus."
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" LocID | \n",
" Location | \n",
" ParentID | \n",
" Time | \n",
" AgeGrp | \n",
" PopShareMale | \n",
" PopShareFemale | \n",
" PopMale | \n",
" PopFemale | \n",
" SurplusShareMale | \n",
" SurplusShareFemale | \n",
" SurplusMale | \n",
" SurplusFemale | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 900 | \n",
" World | \n",
" 0 | \n",
" 1950 | \n",
" 0-4 | \n",
" 6.988 | \n",
" 6.691 | \n",
" 174647.391 | \n",
" 167229.405 | \n",
" 0.297 | \n",
" 0.000 | \n",
" 7417.986 | \n",
" 0.000 | \n",
"
\n",
" \n",
" 1 | \n",
" 900 | \n",
" World | \n",
" 0 | \n",
" 1950 | \n",
" 5-9 | \n",
" 5.478 | \n",
" 5.234 | \n",
" 136914.008 | \n",
" 130816.575 | \n",
" 0.244 | \n",
" 0.000 | \n",
" 6097.433 | \n",
" 0.000 | \n",
"
\n",
" \n",
" 2 | \n",
" 900 | \n",
" World | \n",
" 0 | \n",
" 1950 | \n",
" 10-14 | \n",
" 5.286 | \n",
" 5.040 | \n",
" 132112.875 | \n",
" 125967.993 | \n",
" 0.246 | \n",
" 0.000 | \n",
" 6144.882 | \n",
" 0.000 | \n",
"
\n",
" \n",
" 3 | \n",
" 900 | \n",
" World | \n",
" 0 | \n",
" 1950 | \n",
" 15-19 | \n",
" 4.822 | \n",
" 4.668 | \n",
" 120512.860 | \n",
" 116660.508 | \n",
" 0.154 | \n",
" 0.000 | \n",
" 3852.352 | \n",
" 0.000 | \n",
"
\n",
" \n",
" 4 | \n",
" 900 | \n",
" World | \n",
" 0 | \n",
" 1950 | \n",
" 20-24 | \n",
" 4.442 | \n",
" 4.365 | \n",
" 111019.768 | \n",
" 109095.897 | \n",
" 0.077 | \n",
" 0.000 | \n",
" 1923.871 | \n",
" 0.000 | \n",
"
\n",
" \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
"
\n",
" \n",
" 154282 | \n",
" 876 | \n",
" Wallis and Futuna Islands | \n",
" 957 | \n",
" 2100 | \n",
" 80-84 | \n",
" 2.628 | \n",
" 2.689 | \n",
" 0.260 | \n",
" 0.266 | \n",
" 0.000 | \n",
" 0.061 | \n",
" 0.000 | \n",
" 0.006 | \n",
"
\n",
" \n",
" 154283 | \n",
" 876 | \n",
" Wallis and Futuna Islands | \n",
" 957 | \n",
" 2100 | \n",
" 85-89 | \n",
" 2.426 | \n",
" 2.729 | \n",
" 0.240 | \n",
" 0.270 | \n",
" 0.000 | \n",
" 0.303 | \n",
" 0.000 | \n",
" 0.030 | \n",
"
\n",
" \n",
" 154284 | \n",
" 876 | \n",
" Wallis and Futuna Islands | \n",
" 957 | \n",
" 2100 | \n",
" 90-94 | \n",
" 1.728 | \n",
" 2.012 | \n",
" 0.171 | \n",
" 0.199 | \n",
" 0.000 | \n",
" 0.284 | \n",
" 0.000 | \n",
" 0.028 | \n",
"
\n",
" \n",
" 154285 | \n",
" 876 | \n",
" Wallis and Futuna Islands | \n",
" 957 | \n",
" 2100 | \n",
" 95-99 | \n",
" 0.728 | \n",
" 1.071 | \n",
" 0.072 | \n",
" 0.106 | \n",
" 0.000 | \n",
" 0.343 | \n",
" 0.000 | \n",
" 0.034 | \n",
"
\n",
" \n",
" 154286 | \n",
" 876 | \n",
" Wallis and Futuna Islands | \n",
" 957 | \n",
" 2100 | \n",
" 100+ | \n",
" 0.152 | \n",
" 0.354 | \n",
" 0.015 | \n",
" 0.035 | \n",
" 0.000 | \n",
" 0.202 | \n",
" 0.000 | \n",
" 0.020 | \n",
"
\n",
" \n",
"
\n",
"
154287 rows × 13 columns
\n",
"
"
],
"text/plain": [
" LocID Location ParentID Time AgeGrp PopShareMale \n",
"0 900 World 0 1950 0-4 6.988 \\\n",
"1 900 World 0 1950 5-9 5.478 \n",
"2 900 World 0 1950 10-14 5.286 \n",
"3 900 World 0 1950 15-19 4.822 \n",
"4 900 World 0 1950 20-24 4.442 \n",
"... ... ... ... ... ... ... \n",
"154282 876 Wallis and Futuna Islands 957 2100 80-84 2.628 \n",
"154283 876 Wallis and Futuna Islands 957 2100 85-89 2.426 \n",
"154284 876 Wallis and Futuna Islands 957 2100 90-94 1.728 \n",
"154285 876 Wallis and Futuna Islands 957 2100 95-99 0.728 \n",
"154286 876 Wallis and Futuna Islands 957 2100 100+ 0.152 \n",
"\n",
" PopShareFemale PopMale PopFemale SurplusShareMale \n",
"0 6.691 174647.391 167229.405 0.297 \\\n",
"1 5.234 136914.008 130816.575 0.244 \n",
"2 5.040 132112.875 125967.993 0.246 \n",
"3 4.668 120512.860 116660.508 0.154 \n",
"4 4.365 111019.768 109095.897 0.077 \n",
"... ... ... ... ... \n",
"154282 2.689 0.260 0.266 0.000 \n",
"154283 2.729 0.240 0.270 0.000 \n",
"154284 2.012 0.171 0.199 0.000 \n",
"154285 1.071 0.072 0.106 0.000 \n",
"154286 0.354 0.015 0.035 0.000 \n",
"\n",
" SurplusShareFemale SurplusMale SurplusFemale \n",
"0 0.000 7417.986 0.000 \n",
"1 0.000 6097.433 0.000 \n",
"2 0.000 6144.882 0.000 \n",
"3 0.000 3852.352 0.000 \n",
"4 0.000 1923.871 0.000 \n",
"... ... ... ... \n",
"154282 0.061 0.000 0.006 \n",
"154283 0.303 0.000 0.030 \n",
"154284 0.284 0.000 0.028 \n",
"154285 0.343 0.000 0.034 \n",
"154286 0.202 0.000 0.020 \n",
"\n",
"[154287 rows x 13 columns]"
]
},
"execution_count": 18,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"pyramid[['SurplusShareMale', 'SurplusShareFemale', 'SurplusMale', 'SurplusFemale']] = pyramid.apply(\n",
" lambda row: [\n",
" row['PopShareMale'] - row['PopShareFemale'] if row['PopShareMale'] > row['PopShareFemale'] else 0, \n",
" row['PopShareFemale'] - row['PopShareMale'] if row['PopShareFemale'] > row['PopShareMale'] else 0,\n",
" row['PopMale'] - row['PopFemale'] if row['PopMale'] > row['PopFemale'] else 0, \n",
" row['PopFemale'] - row['PopMale'] if row['PopFemale'] > row['PopMale'] else 0\n",
" ], \n",
" axis=1, \n",
" result_type='expand'\n",
")\n",
"pyramid"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [],
"source": [
"pyramid.to_csv('pyramid.csv', index=False)"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"## Fertility"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/var/folders/cv/xn1my8q111q1ngz5g62wkt_r0000gn/T/ipykernel_1127/3541316337.py:1: DtypeWarning: Columns (2) have mixed types. Specify dtype option on import or set low_memory=False.\n",
" fertility = pd.read_csv('raw/WPP2022_Demographic_Indicators_Medium.csv')\n"
]
}
],
"source": [
"fertility = pd.read_csv('raw/WPP2022_Demographic_Indicators_Medium.csv')\n",
"fertility = fertility.loc[fertility['Time'].isin(list(range(1950, 2022+1)))].reset_index(drop=True)\n",
"\n",
"fertility_regions = fertility[(fertility['LocTypeID'] == 1) | (fertility['LocTypeID'] == 2)]\n",
"fertility_countries = fertility[fertility['LocTypeID'] == 4]\n",
"\n",
"fertility_regions = fertility_regions[['LocID', 'Location', 'Time', 'TFR']]\n",
"fertility_countries = fertility_countries[['LocID', 'Location', 'Time', 'TFR']]"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {},
"outputs": [],
"source": [
"fertility_regions.to_csv('fertility_regions.csv', index=False)\n",
"fertility_countries.to_csv('fertility_countries.csv', index=False)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "twopoints-venv",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.9"
},
"orig_nbformat": 4
},
"nbformat": 4,
"nbformat_minor": 2
}