diff --git a/Labs/tarea6/GRUPO3_T6.ipynb b/Labs/tarea6/GRUPO3_T6.ipynb new file mode 100644 index 0000000..fa42696 --- /dev/null +++ b/Labs/tarea6/GRUPO3_T6.ipynb @@ -0,0 +1,1133 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "id": "4137dd69", + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import numpy as np\n", + "import re \n", + "import warnings\n", + "warnings.filterwarnings('ignore') " + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "2ef139e0", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
numpernumpanh15numpanh16numpanh17numpanh18numpanh19mes_15ubigeo_15dominio_15p400a3_15...mes_18ubigeo_18dominio_18p400a3_18p4022_18mes_19ubigeo_19dominio_19p400a3_19p4022_19
015368115811581158115811581110101sierra norte1946.0...1210101sierra norte1946.00.01110101sierra norte1946.00.0
115369115811581158115811581110101sierra norte1973.0...1210101sierra norte1973.00.01110101sierra norte1973.00.0
215380116211621162116211621110101sierra norte1987.0...1210101sierra norte1987.00.01110101sierra norte1987.01.0
315381116211621162116211621110101sierra norte2009.0...1210101sierra norte2009.00.01110101sierra norte2009.00.0
41541011851185118511851185710101sierra norte1955.0...710101sierra norte1955.00.0710101sierra norte1955.01.0
..................................................................
554134848185491854918549185491854912250101selva2002.0...12250101selva2002.00.010250101selva2002.01.0
554234846185491854918549185491854912250101selva1985.0...12250101selva1985.00.010250101selva1985.00.0
554334847185491854918549185491854912250101selva1976.0...12250101selva1976.00.010250101selva1976.01.0
554434849185491854918549185491854912250101selva2007.0...12250101selva2007.00.010250101selva2007.01.0
554534850185491854918549185491854912250101selva2011.0...12250101selva2011.01.010250101selva2011.00.0
\n", + "

5546 rows × 31 columns

\n", + "
" + ], + "text/plain": [ + " numper numpanh15 numpanh16 numpanh17 numpanh18 numpanh19 mes_15 \\\n", + "0 15368 1158 1158 1158 1158 1158 11 \n", + "1 15369 1158 1158 1158 1158 1158 11 \n", + "2 15380 1162 1162 1162 1162 1162 11 \n", + "3 15381 1162 1162 1162 1162 1162 11 \n", + "4 15410 1185 1185 1185 1185 1185 7 \n", + "... ... ... ... ... ... ... ... \n", + "5541 34848 18549 18549 18549 18549 18549 12 \n", + "5542 34846 18549 18549 18549 18549 18549 12 \n", + "5543 34847 18549 18549 18549 18549 18549 12 \n", + "5544 34849 18549 18549 18549 18549 18549 12 \n", + "5545 34850 18549 18549 18549 18549 18549 12 \n", + "\n", + " ubigeo_15 dominio_15 p400a3_15 ... mes_18 ubigeo_18 \\\n", + "0 10101 sierra norte 1946.0 ... 12 10101 \n", + "1 10101 sierra norte 1973.0 ... 12 10101 \n", + "2 10101 sierra norte 1987.0 ... 12 10101 \n", + "3 10101 sierra norte 2009.0 ... 12 10101 \n", + "4 10101 sierra norte 1955.0 ... 7 10101 \n", + "... ... ... ... ... ... ... \n", + "5541 250101 selva 2002.0 ... 12 250101 \n", + "5542 250101 selva 1985.0 ... 12 250101 \n", + "5543 250101 selva 1976.0 ... 12 250101 \n", + "5544 250101 selva 2007.0 ... 12 250101 \n", + "5545 250101 selva 2011.0 ... 12 250101 \n", + "\n", + " dominio_18 p400a3_18 p4022_18 mes_19 ubigeo_19 dominio_19 \\\n", + "0 sierra norte 1946.0 0.0 11 10101 sierra norte \n", + "1 sierra norte 1973.0 0.0 11 10101 sierra norte \n", + "2 sierra norte 1987.0 0.0 11 10101 sierra norte \n", + "3 sierra norte 2009.0 0.0 11 10101 sierra norte \n", + "4 sierra norte 1955.0 0.0 7 10101 sierra norte \n", + "... ... ... ... ... ... ... \n", + "5541 selva 2002.0 0.0 10 250101 selva \n", + "5542 selva 1985.0 0.0 10 250101 selva \n", + "5543 selva 1976.0 0.0 10 250101 selva \n", + "5544 selva 2007.0 0.0 10 250101 selva \n", + "5545 selva 2011.0 1.0 10 250101 selva \n", + "\n", + " p400a3_19 p4022_19 \n", + "0 1946.0 0.0 \n", + "1 1973.0 0.0 \n", + "2 1987.0 1.0 \n", + "3 2009.0 0.0 \n", + "4 1955.0 1.0 \n", + "... ... ... 
\n", + "5541 2002.0 1.0 \n", + "5542 1985.0 0.0 \n", + "5543 1976.0 1.0 \n", + "5544 2007.0 1.0 \n", + "5545 2011.0 0.0 \n", + "\n", + "[5546 rows x 31 columns]" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "panel = pd.read_stata(\"C:/Users/ALICIA/Documents/GitHub/ultima tarea/data/data.dta\",convert_categoricals=False)\n", + "panel" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "cce469c2", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Index(['numper', 'numpanh15', 'numpanh16', 'numpanh17', 'numpanh18',\n", + " 'numpanh19', 'mes_15', 'ubigeo_15', 'dominio_15', 'p400a3_15',\n", + " 'p4022_15', 'mes_16', 'ubigeo_16', 'dominio_16', 'p400a3_16',\n", + " 'p4022_16', 'mes_17', 'ubigeo_17', 'dominio_17', 'p400a3_17',\n", + " 'p4022_17', 'mes_18', 'ubigeo_18', 'dominio_18', 'p400a3_18',\n", + " 'p4022_18', 'mes_19', 'ubigeo_19', 'dominio_19', 'p400a3_19',\n", + " 'p4022_19'],\n", + " dtype='object')" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "panel.columns[:]" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "9b9d749e", + "metadata": {}, + "outputs": [], + "source": [ + "panel.rename(columns = {'numpanh15':'numpanh_15', 'numpanh16':'numpanh_16','numpanh17':'numpanh_17','numpanh18':'numpanh_18','numpanh19':'numpanh_19'}, inplace = True)" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "f7412527", + "metadata": {}, + "outputs": [], + "source": [ + "filter_list = list(panel.columns)[:] " + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "7e38e765", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['numper',\n", + " 'numpanh_15',\n", + " 'numpanh_16',\n", + " 'numpanh_17',\n", + " 'numpanh_18',\n", + " 'numpanh_19',\n", + " 'mes_15',\n", + " 'ubigeo_15',\n", + " 'dominio_15',\n", + " 'p400a3_15',\n", + " 'p4022_15',\n", + " 
'mes_16',\n", + " 'ubigeo_16',\n", + " 'dominio_16',\n", + " 'p400a3_16',\n", + " 'p4022_16',\n", + " 'mes_17',\n", + " 'ubigeo_17',\n", + " 'dominio_17',\n", + " 'p400a3_17',\n", + " 'p4022_17',\n", + " 'mes_18',\n", + " 'ubigeo_18',\n", + " 'dominio_18',\n", + " 'p400a3_18',\n", + " 'p4022_18',\n", + " 'mes_19',\n", + " 'ubigeo_19',\n", + " 'dominio_19',\n", + " 'p400a3_19',\n", + " 'p4022_19']" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "new_list = list(dict.fromkeys(filter_list))\n", + "new_list" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "416a041f", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
numperperiodnumpanhmesubigeodominiop400a3p4022
0153681511581110101sierra norte1946.00.0
1153691511581110101sierra norte1973.00.0
2153801511621110101sierra norte1987.01.0
3153811511621110101sierra norte2009.01.0
415410151185710101sierra norte1955.00.0
...........................
2772534848191854910250101selva2002.01.0
2772634846191854910250101selva1985.00.0
2772734847191854910250101selva1976.01.0
2772834849191854910250101selva2007.01.0
2772934850191854910250101selva2011.00.0
\n", + "

27730 rows × 8 columns

\n", + "
" + ], + "text/plain": [ + " numper period numpanh mes ubigeo dominio p400a3 p4022\n", + "0 15368 15 1158 11 10101 sierra norte 1946.0 0.0\n", + "1 15369 15 1158 11 10101 sierra norte 1973.0 0.0\n", + "2 15380 15 1162 11 10101 sierra norte 1987.0 1.0\n", + "3 15381 15 1162 11 10101 sierra norte 2009.0 1.0\n", + "4 15410 15 1185 7 10101 sierra norte 1955.0 0.0\n", + "... ... ... ... ... ... ... ... ...\n", + "27725 34848 19 18549 10 250101 selva 2002.0 1.0\n", + "27726 34846 19 18549 10 250101 selva 1985.0 0.0\n", + "27727 34847 19 18549 10 250101 selva 1976.0 1.0\n", + "27728 34849 19 18549 10 250101 selva 2007.0 1.0\n", + "27729 34850 19 18549 10 250101 selva 2011.0 0.0\n", + "\n", + "[27730 rows x 8 columns]" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "reshape_panel = pd.wide_to_long(panel, stubnames = ['numpanh','mes','ubigeo','dominio','p400a3','p4022'], i = ['numper'] , \n", + " j = 'period' , sep = '_').reset_index()\n", + "reshape_panel" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "e9ab148b", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ubigeodistritoprovinciaregionunidos
010101ChachapoyasChachapoyasAmazonas1
110102AsuncionChachapoyasAmazonas1
210103BalsasChachapoyasAmazonas0
310104ChetoChachapoyasAmazonas1
410105ChiliquinChachapoyasAmazonas0
..................
1869250302IrazolaPadre AbadUcayali0
1870250303CurimanaPadre AbadUcayali0
1871250304NeshuyaPadre AbadUcayali0
1872250305Alexander von HumboldtPadre AbadUcayali0
1873250401PurusPurusUcayali1
\n", + "

1874 rows × 5 columns

\n", + "
" + ], + "text/plain": [ + " ubigeo distrito provincia region unidos\n", + "0 10101 Chachapoyas Chachapoyas Amazonas 1\n", + "1 10102 Asuncion Chachapoyas Amazonas 1\n", + "2 10103 Balsas Chachapoyas Amazonas 0\n", + "3 10104 Cheto Chachapoyas Amazonas 1\n", + "4 10105 Chiliquin Chachapoyas Amazonas 0\n", + "... ... ... ... ... ...\n", + "1869 250302 Irazola Padre Abad Ucayali 0\n", + "1870 250303 Curimana Padre Abad Ucayali 0\n", + "1871 250304 Neshuya Padre Abad Ucayali 0\n", + "1872 250305 Alexander von Humboldt Padre Abad Ucayali 0\n", + "1873 250401 Purus Purus Ucayali 1\n", + "\n", + "[1874 rows x 5 columns]" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "unidos = pd.read_stata(\"C:/Users/ALICIA/Documents/GitHub/ultima tarea/data/unidos.dta\",\n", + " convert_categoricals=False)\n", + "unidos" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "a13973a0", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
numperperiodnumpanhmesubigeodominiop400a3p4022distritoprovinciaregionunidos
0153681511581110101sierra norte1946.00.0ChachapoyasChachapoyasAmazonas1
1153691511581110101sierra norte1973.00.0ChachapoyasChachapoyasAmazonas1
2153801511621110101sierra norte1987.01.0ChachapoyasChachapoyasAmazonas1
3153811511621110101sierra norte2009.01.0ChachapoyasChachapoyasAmazonas1
415410151185710101sierra norte1955.00.0ChachapoyasChachapoyasAmazonas1
.......................................
277253477719184727220301selva1942.00.0San Jose de SisaEl DoradoSan Martin0
277263477919184737220301selva1988.00.0San Jose de SisaEl DoradoSan Martin0
277273478119184737220301selva2015.00.0San Jose de SisaEl DoradoSan Martin0
277283477819184737220301selva1984.01.0San Jose de SisaEl DoradoSan Martin0
277293478019184737220301selva2010.00.0San Jose de SisaEl DoradoSan Martin0
\n", + "

27730 rows × 12 columns

\n", + "
" + ], + "text/plain": [ + " numper period numpanh mes ubigeo dominio p400a3 p4022 \\\n", + "0 15368 15 1158 11 10101 sierra norte 1946.0 0.0 \n", + "1 15369 15 1158 11 10101 sierra norte 1973.0 0.0 \n", + "2 15380 15 1162 11 10101 sierra norte 1987.0 1.0 \n", + "3 15381 15 1162 11 10101 sierra norte 2009.0 1.0 \n", + "4 15410 15 1185 7 10101 sierra norte 1955.0 0.0 \n", + "... ... ... ... ... ... ... ... ... \n", + "27725 34777 19 18472 7 220301 selva 1942.0 0.0 \n", + "27726 34779 19 18473 7 220301 selva 1988.0 0.0 \n", + "27727 34781 19 18473 7 220301 selva 2015.0 0.0 \n", + "27728 34778 19 18473 7 220301 selva 1984.0 1.0 \n", + "27729 34780 19 18473 7 220301 selva 2010.0 0.0 \n", + "\n", + " distrito provincia region unidos \n", + "0 Chachapoyas Chachapoyas Amazonas 1 \n", + "1 Chachapoyas Chachapoyas Amazonas 1 \n", + "2 Chachapoyas Chachapoyas Amazonas 1 \n", + "3 Chachapoyas Chachapoyas Amazonas 1 \n", + "4 Chachapoyas Chachapoyas Amazonas 1 \n", + "... ... ... ... ... \n", + "27725 San Jose de Sisa El Dorado San Martin 0 \n", + "27726 San Jose de Sisa El Dorado San Martin 0 \n", + "27727 San Jose de Sisa El Dorado San Martin 0 \n", + "27728 San Jose de Sisa El Dorado San Martin 0 \n", + "27729 San Jose de Sisa El Dorado San Martin 0 \n", + "\n", + "[27730 rows x 12 columns]" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "juntos = pd.merge(reshape_panel, unidos, on='ubigeo', how='inner')\n", + "juntos" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ee08714c", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + 
"version": "3.10.9" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/Labs/tarea6/GRUPO_3_GoogleCollab.ipynb b/Labs/tarea6/GRUPO_3_GoogleCollab.ipynb new file mode 100644 index 0000000..1889542 --- /dev/null +++ b/Labs/tarea6/GRUPO_3_GoogleCollab.ipynb @@ -0,0 +1,1759 @@ +{ + "nbformat": 4, + "nbformat_minor": 0, + "metadata": { + "colab": { + "provenance": [] + }, + "kernelspec": { + "name": "python3", + "display_name": "Python 3" + }, + "language_info": { + "name": "python" + } + }, + "cells": [ + { + "cell_type": "code", + "source": [ + "!pip install ghostscript\n", + "!pip install camelot-py[cv]\n", + "!pip install excalibur-py\n", + "!apt install ghostscript python3-tk" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "mvCaIqLfjQ4D", + "outputId": "e5176b58-3f55-49c5-f836-791cfce83878" + }, + "execution_count": 3, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/\n", + "Collecting ghostscript\n", + " Downloading ghostscript-0.7-py2.py3-none-any.whl (25 kB)\n", + "Requirement already satisfied: setuptools>=38.6.0 in /usr/local/lib/python3.10/dist-packages (from ghostscript) (67.7.2)\n", + "Installing collected packages: ghostscript\n", + "Successfully installed ghostscript-0.7\n", + "Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/\n", + "Collecting camelot-py[cv]\n", + " Downloading camelot_py-0.11.0-py3-none-any.whl (40 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m41.0/41.0 kB\u001b[0m \u001b[31m5.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hRequirement already satisfied: chardet>=3.0.4 in /usr/local/lib/python3.10/dist-packages (from camelot-py[cv]) (4.0.0)\n", + "Requirement already satisfied: click>=6.7 in /usr/local/lib/python3.10/dist-packages (from 
camelot-py[cv]) (8.1.3)\n", + "Requirement already satisfied: numpy>=1.13.3 in /usr/local/lib/python3.10/dist-packages (from camelot-py[cv]) (1.22.4)\n", + "Requirement already satisfied: openpyxl>=2.5.8 in /usr/local/lib/python3.10/dist-packages (from camelot-py[cv]) (3.0.10)\n", + "Requirement already satisfied: pandas>=0.23.4 in /usr/local/lib/python3.10/dist-packages (from camelot-py[cv]) (1.5.3)\n", + "Collecting pdfminer.six>=20200726 (from camelot-py[cv])\n", + " Downloading pdfminer.six-20221105-py3-none-any.whl (5.6 MB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m5.6/5.6 MB\u001b[0m \u001b[31m82.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hCollecting pypdf>=3.0.0 (from camelot-py[cv])\n", + " Downloading pypdf-3.9.1-py3-none-any.whl (249 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m249.3/249.3 kB\u001b[0m \u001b[31m31.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hRequirement already satisfied: tabulate>=0.8.9 in /usr/local/lib/python3.10/dist-packages (from camelot-py[cv]) (0.8.10)\n", + "Requirement already satisfied: ghostscript>=0.7 in /usr/local/lib/python3.10/dist-packages (from camelot-py[cv]) (0.7)\n", + "Requirement already satisfied: opencv-python>=3.4.2.17 in /usr/local/lib/python3.10/dist-packages (from camelot-py[cv]) (4.7.0.72)\n", + "INFO: pip is looking at multiple versions of camelot-py[cv] to determine which version is compatible with other requirements. 
This could take a while.\n", + "Collecting camelot-py[cv]\n", + " Downloading camelot_py-0.10.1-py3-none-any.whl (40 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m41.0/41.0 kB\u001b[0m \u001b[31m5.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hRequirement already satisfied: PyPDF2>=1.26.0 in /usr/local/lib/python3.10/dist-packages (from camelot-py[cv]) (2.12.1)\n", + " Downloading camelot_py-0.10.0-py3-none-any.whl (40 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m41.0/41.0 kB\u001b[0m \u001b[31m6.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25h Downloading camelot_py-0.9.0-py3-none-any.whl (43 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m43.2/43.2 kB\u001b[0m \u001b[31m6.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hRequirement already satisfied: et-xmlfile in /usr/local/lib/python3.10/dist-packages (from openpyxl>=2.5.8->camelot-py[cv]) (1.1.0)\n", + "Requirement already satisfied: python-dateutil>=2.8.1 in /usr/local/lib/python3.10/dist-packages (from pandas>=0.23.4->camelot-py[cv]) (2.8.2)\n", + "Requirement already satisfied: pytz>=2020.1 in /usr/local/lib/python3.10/dist-packages (from pandas>=0.23.4->camelot-py[cv]) (2022.7.1)\n", + "Requirement already satisfied: charset-normalizer>=2.0.0 in /usr/local/lib/python3.10/dist-packages (from pdfminer.six>=20200726->camelot-py[cv]) (2.0.12)\n", + "Requirement already satisfied: cryptography>=36.0.0 in /usr/local/lib/python3.10/dist-packages (from pdfminer.six>=20200726->camelot-py[cv]) (40.0.2)\n", + "Requirement already satisfied: cffi>=1.12 in /usr/local/lib/python3.10/dist-packages (from cryptography>=36.0.0->pdfminer.six>=20200726->camelot-py[cv]) (1.15.1)\n", + "Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.10/dist-packages (from python-dateutil>=2.8.1->pandas>=0.23.4->camelot-py[cv]) (1.16.0)\n", + 
"Requirement already satisfied: pycparser in /usr/local/lib/python3.10/dist-packages (from cffi>=1.12->cryptography>=36.0.0->pdfminer.six>=20200726->camelot-py[cv]) (2.21)\n", + "Installing collected packages: pdfminer.six, camelot-py\n", + "Successfully installed camelot-py-0.9.0 pdfminer.six-20221105\n", + "Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/\n", + "Collecting excalibur-py\n", + " Downloading excalibur_py-0.4.3-py3-none-any.whl (1.5 MB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.5/1.5 MB\u001b[0m \u001b[31m30.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hRequirement already satisfied: camelot-py[cv]>=0.7.1 in /usr/local/lib/python3.10/dist-packages (from excalibur-py) (0.9.0)\n", + "Collecting celery>=4.1.1 (from excalibur-py)\n", + " Downloading celery-5.3.0-py3-none-any.whl (420 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m420.3/420.3 kB\u001b[0m \u001b[31m46.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hRequirement already satisfied: Click>=7.0 in /usr/local/lib/python3.10/dist-packages (from excalibur-py) (8.1.3)\n", + "Collecting configparser<3.6.0,>=3.5.0 (from excalibur-py)\n", + " Downloading configparser-3.5.3-py3-none-any.whl (21 kB)\n", + "Requirement already satisfied: Flask>=1.0.2 in /usr/local/lib/python3.10/dist-packages (from excalibur-py) (2.2.4)\n", + "Requirement already satisfied: SQLAlchemy>=1.2.12 in /usr/local/lib/python3.10/dist-packages (from excalibur-py) (2.0.10)\n", + "Collecting Werkzeug<1.0.0 (from excalibur-py)\n", + " Downloading Werkzeug-0.16.1-py2.py3-none-any.whl (327 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m327.4/327.4 kB\u001b[0m \u001b[31m39.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hRequirement already satisfied: chardet>=3.0.4 in 
/usr/local/lib/python3.10/dist-packages (from camelot-py[cv]>=0.7.1->excalibur-py) (4.0.0)\n", + "Requirement already satisfied: numpy>=1.13.3 in /usr/local/lib/python3.10/dist-packages (from camelot-py[cv]>=0.7.1->excalibur-py) (1.22.4)\n", + "Requirement already satisfied: openpyxl>=2.5.8 in /usr/local/lib/python3.10/dist-packages (from camelot-py[cv]>=0.7.1->excalibur-py) (3.0.10)\n", + "Requirement already satisfied: pandas>=0.23.4 in /usr/local/lib/python3.10/dist-packages (from camelot-py[cv]>=0.7.1->excalibur-py) (1.5.3)\n", + "Requirement already satisfied: pdfminer.six>=20200726 in /usr/local/lib/python3.10/dist-packages (from camelot-py[cv]>=0.7.1->excalibur-py) (20221105)\n", + "Requirement already satisfied: PyPDF2>=1.26.0 in /usr/local/lib/python3.10/dist-packages (from camelot-py[cv]>=0.7.1->excalibur-py) (2.12.1)\n", + "Requirement already satisfied: opencv-python>=3.4.2.17 in /usr/local/lib/python3.10/dist-packages (from camelot-py[cv]>=0.7.1->excalibur-py) (4.7.0.72)\n", + "Collecting billiard<5.0,>=4.1.0 (from celery>=4.1.1->excalibur-py)\n", + " Downloading billiard-4.1.0-py3-none-any.whl (86 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m86.7/86.7 kB\u001b[0m \u001b[31m13.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hCollecting click-didyoumean>=0.3.0 (from celery>=4.1.1->excalibur-py)\n", + " Downloading click_didyoumean-0.3.0-py3-none-any.whl (2.7 kB)\n", + "Collecting click-plugins>=1.1.1 (from celery>=4.1.1->excalibur-py)\n", + " Downloading click_plugins-1.1.1-py2.py3-none-any.whl (7.5 kB)\n", + "Collecting click-repl>=0.2.0 (from celery>=4.1.1->excalibur-py)\n", + " Downloading click_repl-0.2.0-py3-none-any.whl (5.2 kB)\n", + "Collecting kombu<6.0,>=5.3.0 (from celery>=4.1.1->excalibur-py)\n", + " Downloading kombu-5.3.0-py3-none-any.whl (198 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m198.5/198.5 kB\u001b[0m \u001b[31m27.4 
MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hRequirement already satisfied: python-dateutil>=2.8.2 in /usr/local/lib/python3.10/dist-packages (from celery>=4.1.1->excalibur-py) (2.8.2)\n", + "Requirement already satisfied: tzdata>=2022.7 in /usr/local/lib/python3.10/dist-packages (from celery>=4.1.1->excalibur-py) (2023.3)\n", + "Collecting vine<6.0,>=5.0.0 (from celery>=4.1.1->excalibur-py)\n", + " Downloading vine-5.0.0-py2.py3-none-any.whl (9.4 kB)\n", + "INFO: pip is looking at multiple versions of flask to determine which version is compatible with other requirements. This could take a while.\n", + "Collecting Flask>=1.0.2 (from excalibur-py)\n", + " Downloading Flask-2.3.2-py3-none-any.whl (96 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m96.9/96.9 kB\u001b[0m \u001b[31m12.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25h Downloading Flask-2.3.1-py3-none-any.whl (96 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m97.0/97.0 kB\u001b[0m \u001b[31m12.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25h Downloading Flask-2.3.0-py3-none-any.whl (96 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m97.0/97.0 kB\u001b[0m \u001b[31m13.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25h Downloading Flask-2.2.5-py3-none-any.whl (101 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m101.8/101.8 kB\u001b[0m \u001b[31m13.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25h Downloading Flask-2.2.3-py3-none-any.whl (101 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m101.8/101.8 kB\u001b[0m \u001b[31m12.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25h Downloading Flask-2.2.2-py3-none-any.whl (101 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m 
\u001b[32m101.5/101.5 kB\u001b[0m \u001b[31m14.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25h Downloading Flask-2.2.1-py3-none-any.whl (101 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m101.5/101.5 kB\u001b[0m \u001b[31m14.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hINFO: pip is looking at multiple versions of flask to determine which version is compatible with other requirements. This could take a while.\n", + " Downloading Flask-2.2.0-py3-none-any.whl (101 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m101.1/101.1 kB\u001b[0m \u001b[31m8.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25h Downloading Flask-2.1.3-py3-none-any.whl (95 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m95.6/95.6 kB\u001b[0m \u001b[31m13.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25h Downloading Flask-2.1.2-py3-none-any.whl (95 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m95.2/95.2 kB\u001b[0m \u001b[31m13.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25h Downloading Flask-2.1.1-py3-none-any.whl (95 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m95.2/95.2 kB\u001b[0m \u001b[31m13.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25h Downloading Flask-2.1.0-py3-none-any.whl (95 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m95.2/95.2 kB\u001b[0m \u001b[31m13.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hINFO: This is taking longer than usual. You might need to provide the dependency resolver with stricter constraints to reduce runtime. See https://pip.pypa.io/warnings/backtracking for guidance. 
If you want to abort this run, press Ctrl + C.\n", + " Downloading Flask-2.0.3-py3-none-any.whl (95 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m95.6/95.6 kB\u001b[0m \u001b[31m13.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25h Downloading Flask-2.0.2-py3-none-any.whl (95 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m95.2/95.2 kB\u001b[0m \u001b[31m14.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25h Downloading Flask-2.0.1-py3-none-any.whl (94 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m94.8/94.8 kB\u001b[0m \u001b[31m14.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25h Downloading Flask-2.0.0-py3-none-any.whl (93 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m93.2/93.2 kB\u001b[0m \u001b[31m13.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25h Downloading Flask-1.1.4-py2.py3-none-any.whl (94 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m94.6/94.6 kB\u001b[0m \u001b[31m13.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hCollecting Jinja2<3.0,>=2.10.1 (from Flask>=1.0.2->excalibur-py)\n", + " Downloading Jinja2-2.11.3-py2.py3-none-any.whl (125 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m125.7/125.7 kB\u001b[0m \u001b[31m16.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hCollecting itsdangerous<2.0,>=0.24 (from Flask>=1.0.2->excalibur-py)\n", + " Downloading itsdangerous-1.1.0-py2.py3-none-any.whl (16 kB)\n", + "Collecting Flask>=1.0.2 (from excalibur-py)\n", + " Downloading Flask-1.1.3-py2.py3-none-any.whl (94 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m94.6/94.6 kB\u001b[0m \u001b[31m13.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25h 
Downloading Flask-1.1.2-py2.py3-none-any.whl (94 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m94.6/94.6 kB\u001b[0m \u001b[31m11.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hRequirement already satisfied: Jinja2>=2.10.1 in /usr/local/lib/python3.10/dist-packages (from Flask>=1.0.2->excalibur-py) (3.1.2)\n", + "Requirement already satisfied: itsdangerous>=0.24 in /usr/local/lib/python3.10/dist-packages (from Flask>=1.0.2->excalibur-py) (2.1.2)\n", + "Requirement already satisfied: typing-extensions>=4.2.0 in /usr/local/lib/python3.10/dist-packages (from SQLAlchemy>=1.2.12->excalibur-py) (4.5.0)\n", + "Requirement already satisfied: greenlet!=0.4.17 in /usr/local/lib/python3.10/dist-packages (from SQLAlchemy>=1.2.12->excalibur-py) (2.0.2)\n", + "Requirement already satisfied: prompt-toolkit in /usr/local/lib/python3.10/dist-packages (from click-repl>=0.2.0->celery>=4.1.1->excalibur-py) (3.0.38)\n", + "Requirement already satisfied: six in /usr/local/lib/python3.10/dist-packages (from click-repl>=0.2.0->celery>=4.1.1->excalibur-py) (1.16.0)\n", + "Requirement already satisfied: MarkupSafe>=2.0 in /usr/local/lib/python3.10/dist-packages (from Jinja2>=2.10.1->Flask>=1.0.2->excalibur-py) (2.1.2)\n", + "Collecting amqp<6.0.0,>=5.1.1 (from kombu<6.0,>=5.3.0->celery>=4.1.1->excalibur-py)\n", + " Downloading amqp-5.1.1-py3-none-any.whl (50 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m50.8/50.8 kB\u001b[0m \u001b[31m7.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hRequirement already satisfied: et-xmlfile in /usr/local/lib/python3.10/dist-packages (from openpyxl>=2.5.8->camelot-py[cv]>=0.7.1->excalibur-py) (1.1.0)\n", + "Requirement already satisfied: pytz>=2020.1 in /usr/local/lib/python3.10/dist-packages (from pandas>=0.23.4->camelot-py[cv]>=0.7.1->excalibur-py) (2022.7.1)\n", + "Requirement already satisfied: charset-normalizer>=2.0.0 in 
/usr/local/lib/python3.10/dist-packages (from pdfminer.six>=20200726->camelot-py[cv]>=0.7.1->excalibur-py) (2.0.12)\n", + "Requirement already satisfied: cryptography>=36.0.0 in /usr/local/lib/python3.10/dist-packages (from pdfminer.six>=20200726->camelot-py[cv]>=0.7.1->excalibur-py) (40.0.2)\n", + "Requirement already satisfied: cffi>=1.12 in /usr/local/lib/python3.10/dist-packages (from cryptography>=36.0.0->pdfminer.six>=20200726->camelot-py[cv]>=0.7.1->excalibur-py) (1.15.1)\n", + "Requirement already satisfied: wcwidth in /usr/local/lib/python3.10/dist-packages (from prompt-toolkit->click-repl>=0.2.0->celery>=4.1.1->excalibur-py) (0.2.6)\n", + "Requirement already satisfied: pycparser in /usr/local/lib/python3.10/dist-packages (from cffi>=1.12->cryptography>=36.0.0->pdfminer.six>=20200726->camelot-py[cv]>=0.7.1->excalibur-py) (2.21)\n", + "Installing collected packages: Werkzeug, vine, configparser, click-plugins, click-didyoumean, billiard, Flask, click-repl, amqp, kombu, celery, excalibur-py\n", + " Attempting uninstall: Werkzeug\n", + " Found existing installation: Werkzeug 2.3.0\n", + " Uninstalling Werkzeug-2.3.0:\n", + " Successfully uninstalled Werkzeug-2.3.0\n", + " Attempting uninstall: Flask\n", + " Found existing installation: Flask 2.2.4\n", + " Uninstalling Flask-2.2.4:\n", + " Successfully uninstalled Flask-2.2.4\n", + "\u001b[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.\n", + "tensorboard 2.12.2 requires werkzeug>=1.0.1, but you have werkzeug 0.16.1 which is incompatible.\u001b[0m\u001b[31m\n", + "\u001b[0mSuccessfully installed Flask-1.1.2 Werkzeug-0.16.1 amqp-5.1.1 billiard-4.1.0 celery-5.3.0 click-didyoumean-0.3.0 click-plugins-1.1.1 click-repl-0.2.0 configparser-3.5.3 excalibur-py-0.4.3 kombu-5.3.0 vine-5.0.0\n", + "Reading package lists... 
Done\n", + "Building dependency tree \n", + "Reading state information... Done\n", + "The following additional packages will be installed:\n", + " blt fonts-droid-fallback fonts-noto-mono fonts-urw-base35 libgs9\n", + " libgs9-common libidn11 libijs-0.35 libjbig2dec0 poppler-data tk8.6-blt2.5\n", + "Suggested packages:\n", + " blt-demo fonts-noto fonts-freefont-otf | fonts-freefont-ttf fonts-texgyre\n", + " ghostscript-x poppler-utils fonts-japanese-mincho | fonts-ipafont-mincho\n", + " fonts-japanese-gothic | fonts-ipafont-gothic fonts-arphic-ukai\n", + " fonts-arphic-uming fonts-nanum tix python3-tk-dbg\n", + "The following NEW packages will be installed:\n", + " blt fonts-droid-fallback fonts-noto-mono fonts-urw-base35 ghostscript libgs9\n", + " libgs9-common libidn11 libijs-0.35 libjbig2dec0 poppler-data python3-tk\n", + " tk8.6-blt2.5\n", + "0 upgraded, 13 newly installed, 0 to remove and 38 not upgraded.\n", + "Need to get 13.4 MB of archives.\n", + "After this operation, 54.8 MB of additional disk space will be used.\n", + "Get:1 http://archive.ubuntu.com/ubuntu focal/main amd64 fonts-droid-fallback all 1:6.0.1r16-1.1 [1,805 kB]\n", + "Get:2 http://archive.ubuntu.com/ubuntu focal/main amd64 poppler-data all 0.4.9-2 [1,475 kB]\n", + "Get:3 http://archive.ubuntu.com/ubuntu focal/main amd64 tk8.6-blt2.5 amd64 2.5.3+dfsg-4 [572 kB]\n", + "Get:4 http://archive.ubuntu.com/ubuntu focal/main amd64 blt amd64 2.5.3+dfsg-4 [4,944 B]\n", + "Get:5 http://archive.ubuntu.com/ubuntu focal-updates/main amd64 fonts-noto-mono all 20200323-1build1~ubuntu20.04.1 [80.6 kB]\n", + "Get:6 http://archive.ubuntu.com/ubuntu focal/main amd64 fonts-urw-base35 all 20170801.1-3 [6,333 kB]\n", + "Get:7 http://archive.ubuntu.com/ubuntu focal-updates/main amd64 libgs9-common all 9.50~dfsg-5ubuntu4.7 [681 kB]\n", + "Get:8 http://archive.ubuntu.com/ubuntu focal/main amd64 libidn11 amd64 1.33-2.2ubuntu2 [46.2 kB]\n", + "Get:9 http://archive.ubuntu.com/ubuntu focal/main amd64 libijs-0.35 amd64 
0.35-15 [15.7 kB]\n", + "Get:10 http://archive.ubuntu.com/ubuntu focal/main amd64 libjbig2dec0 amd64 0.18-1ubuntu1 [60.0 kB]\n", + "Get:11 http://archive.ubuntu.com/ubuntu focal-updates/main amd64 libgs9 amd64 9.50~dfsg-5ubuntu4.7 [2,173 kB]\n", + "Get:12 http://archive.ubuntu.com/ubuntu focal-updates/main amd64 ghostscript amd64 9.50~dfsg-5ubuntu4.7 [51.9 kB]\n", + "Get:13 http://archive.ubuntu.com/ubuntu focal-updates/main amd64 python3-tk amd64 3.8.10-0ubuntu1~20.04 [104 kB]\n", + "Fetched 13.4 MB in 0s (37.4 MB/s)\n", + "Selecting previously unselected package fonts-droid-fallback.\n", + "(Reading database ... 122541 files and directories currently installed.)\n", + "Preparing to unpack .../00-fonts-droid-fallback_1%3a6.0.1r16-1.1_all.deb ...\n", + "Unpacking fonts-droid-fallback (1:6.0.1r16-1.1) ...\n", + "Selecting previously unselected package poppler-data.\n", + "Preparing to unpack .../01-poppler-data_0.4.9-2_all.deb ...\n", + "Unpacking poppler-data (0.4.9-2) ...\n", + "Selecting previously unselected package tk8.6-blt2.5.\n", + "Preparing to unpack .../02-tk8.6-blt2.5_2.5.3+dfsg-4_amd64.deb ...\n", + "Unpacking tk8.6-blt2.5 (2.5.3+dfsg-4) ...\n", + "Selecting previously unselected package blt.\n", + "Preparing to unpack .../03-blt_2.5.3+dfsg-4_amd64.deb ...\n", + "Unpacking blt (2.5.3+dfsg-4) ...\n", + "Selecting previously unselected package fonts-noto-mono.\n", + "Preparing to unpack .../04-fonts-noto-mono_20200323-1build1~ubuntu20.04.1_all.deb ...\n", + "Unpacking fonts-noto-mono (20200323-1build1~ubuntu20.04.1) ...\n", + "Selecting previously unselected package fonts-urw-base35.\n", + "Preparing to unpack .../05-fonts-urw-base35_20170801.1-3_all.deb ...\n", + "Unpacking fonts-urw-base35 (20170801.1-3) ...\n", + "Selecting previously unselected package libgs9-common.\n", + "Preparing to unpack .../06-libgs9-common_9.50~dfsg-5ubuntu4.7_all.deb ...\n", + "Unpacking libgs9-common (9.50~dfsg-5ubuntu4.7) ...\n", + "Selecting previously unselected package 
libidn11:amd64.\n", + "Preparing to unpack .../07-libidn11_1.33-2.2ubuntu2_amd64.deb ...\n", + "Unpacking libidn11:amd64 (1.33-2.2ubuntu2) ...\n", + "Selecting previously unselected package libijs-0.35:amd64.\n", + "Preparing to unpack .../08-libijs-0.35_0.35-15_amd64.deb ...\n", + "Unpacking libijs-0.35:amd64 (0.35-15) ...\n", + "Selecting previously unselected package libjbig2dec0:amd64.\n", + "Preparing to unpack .../09-libjbig2dec0_0.18-1ubuntu1_amd64.deb ...\n", + "Unpacking libjbig2dec0:amd64 (0.18-1ubuntu1) ...\n", + "Selecting previously unselected package libgs9:amd64.\n", + "Preparing to unpack .../10-libgs9_9.50~dfsg-5ubuntu4.7_amd64.deb ...\n", + "Unpacking libgs9:amd64 (9.50~dfsg-5ubuntu4.7) ...\n", + "Selecting previously unselected package ghostscript.\n", + "Preparing to unpack .../11-ghostscript_9.50~dfsg-5ubuntu4.7_amd64.deb ...\n", + "Unpacking ghostscript (9.50~dfsg-5ubuntu4.7) ...\n", + "Selecting previously unselected package python3-tk:amd64.\n", + "Preparing to unpack .../12-python3-tk_3.8.10-0ubuntu1~20.04_amd64.deb ...\n", + "Unpacking python3-tk:amd64 (3.8.10-0ubuntu1~20.04) ...\n", + "Setting up tk8.6-blt2.5 (2.5.3+dfsg-4) ...\n", + "Setting up fonts-noto-mono (20200323-1build1~ubuntu20.04.1) ...\n", + "Setting up libijs-0.35:amd64 (0.35-15) ...\n", + "Setting up blt (2.5.3+dfsg-4) ...\n", + "Setting up python3-tk:amd64 (3.8.10-0ubuntu1~20.04) ...\n", + "Setting up fonts-urw-base35 (20170801.1-3) ...\n", + "Setting up poppler-data (0.4.9-2) ...\n", + "Setting up libjbig2dec0:amd64 (0.18-1ubuntu1) ...\n", + "Setting up libidn11:amd64 (1.33-2.2ubuntu2) ...\n", + "Setting up fonts-droid-fallback (1:6.0.1r16-1.1) ...\n", + "Setting up libgs9-common (9.50~dfsg-5ubuntu4.7) ...\n", + "Setting up libgs9:amd64 (9.50~dfsg-5ubuntu4.7) ...\n", + "Setting up ghostscript (9.50~dfsg-5ubuntu4.7) ...\n", + "Processing triggers for fontconfig (2.13.1-2ubuntu3) ...\n", + "Processing triggers for libc-bin (2.31-0ubuntu9.9) ...\n", + "Processing triggers for 
man-db (2.9.1-1) ...\n" + ] + } + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "m07h1PVqi5o7", + "outputId": "46571940-7515-4734-c483-7816759b8f78" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/\n", + "Collecting PyPDF2==2.12.1\n", + " Downloading pypdf2-2.12.1-py3-none-any.whl (222 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m222.8/222.8 kB\u001b[0m \u001b[31m15.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hInstalling collected packages: PyPDF2\n", + "Successfully installed PyPDF2-2.12.1\n" + ] + } + ], + "source": [ + "!pip install --upgrade PyPDF2==2.12.1\n" + ] + }, + { + "cell_type": "code", + "source": [ + "import camelot as cm # libreria de camelot\n", + "import numpy as np\n", + "import pandas as pd\n", + "from pandas import Series, DataFrame\n", + "import os\n", + "from PyPDF2 import PdfFileReader\n", + "import re # expresiones regulares" + ], + "metadata": { + "id": "gqdYIizhi_lx" + }, + "execution_count": 4, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "tables = cm.read_pdf(\"/content/sample_data/G3_JUNIN.pdf\")\n" + ], + "metadata": { + "id": "K6YPDnp_jdnk" + }, + "execution_count": 6, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "tables" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "SOt__8yzj4h8", + "outputId": "7d68f20f-f9d7-4888-c2b2-37893527fba7" + }, + "execution_count": 7, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "" + ] + }, + "metadata": {}, + "execution_count": 7 + } + ] + }, + { + "cell_type": "code", + "source": [ + "dir(tables\n", + " )" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": 
"Jlkpc5OmjoQh", + "outputId": "3c7e721e-3e41-4f79-81e4-6c6bc7efa6f5" + }, + "execution_count": 8, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "['__class__',\n", + " '__delattr__',\n", + " '__dict__',\n", + " '__dir__',\n", + " '__doc__',\n", + " '__eq__',\n", + " '__format__',\n", + " '__ge__',\n", + " '__getattribute__',\n", + " '__getitem__',\n", + " '__gt__',\n", + " '__hash__',\n", + " '__init__',\n", + " '__init_subclass__',\n", + " '__le__',\n", + " '__len__',\n", + " '__lt__',\n", + " '__module__',\n", + " '__ne__',\n", + " '__new__',\n", + " '__reduce__',\n", + " '__reduce_ex__',\n", + " '__repr__',\n", + " '__setattr__',\n", + " '__sizeof__',\n", + " '__str__',\n", + " '__subclasshook__',\n", + " '__weakref__',\n", + " '_compress_dir',\n", + " '_format_func',\n", + " '_tables',\n", + " '_write_file',\n", + " 'export',\n", + " 'n']" + ] + }, + "metadata": {}, + "execution_count": 8 + } + ] + }, + { + "cell_type": "code", + "source": [ + "dir(tables[0])" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "ad5c0C74kITO", + "outputId": "698760b7-574c-4e3a-9678-99f45898a56b" + }, + "execution_count": 9, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "['__class__',\n", + " '__delattr__',\n", + " '__dict__',\n", + " '__dir__',\n", + " '__doc__',\n", + " '__eq__',\n", + " '__format__',\n", + " '__ge__',\n", + " '__getattribute__',\n", + " '__gt__',\n", + " '__hash__',\n", + " '__init__',\n", + " '__init_subclass__',\n", + " '__le__',\n", + " '__lt__',\n", + " '__module__',\n", + " '__ne__',\n", + " '__new__',\n", + " '__reduce__',\n", + " '__reduce_ex__',\n", + " '__repr__',\n", + " '__setattr__',\n", + " '__sizeof__',\n", + " '__str__',\n", + " '__subclasshook__',\n", + " '__weakref__',\n", + " '_bbox',\n", + " '_image',\n", + " '_segments',\n", + " '_text',\n", + " '_textedges',\n", + " 'accuracy',\n", + " 'cells',\n", + " 'cols',\n", + " 
'data',\n", + " 'df',\n", + " 'flavor',\n", + " 'order',\n", + " 'page',\n", + " 'parsing_report',\n", + " 'rows',\n", + " 'set_all_edges',\n", + " 'set_border',\n", + " 'set_edges',\n", + " 'set_span',\n", + " 'shape',\n", + " 'to_csv',\n", + " 'to_excel',\n", + " 'to_html',\n", + " 'to_json',\n", + " 'to_sqlite',\n", + " 'whitespace']" + ] + }, + "metadata": {}, + "execution_count": 9 + } + ] + }, + { + "cell_type": "code", + "source": [ + "tables[0].cells " + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "5IAXhe2OkgAV", + "outputId": "1e02f6df-776c-47c1-e5f5-9f4d4b2e8ede" + }, + "execution_count": 11, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "[[,\n", + " ,\n", + " ,\n", + " ,\n", + " ,\n", + " ,\n", + " ,\n", + " ,\n", + " ,\n", + " ,\n", + " ,\n", + " ,\n", + " ],\n", + " [,\n", + " ,\n", + " ,\n", + " ,\n", + " ,\n", + " ,\n", + " ,\n", + " ,\n", + " ,\n", + " ,\n", + " ,\n", + " ,\n", + " ],\n", + " [,\n", + " ,\n", + " ,\n", + " ,\n", + " ,\n", + " ,\n", + " ,\n", + " ,\n", + " ,\n", + " ,\n", + " ,\n", + " ,\n", + " ],\n", + " [,\n", + " ,\n", + " ,\n", + " ,\n", + " ,\n", + " ,\n", + " ,\n", + " ,\n", + " ,\n", + " ,\n", + " ,\n", + " ,\n", + " ],\n", + " [,\n", + " ,\n", + " ,\n", + " ,\n", + " ,\n", + " ,\n", + " ,\n", + " ,\n", + " ,\n", + " ,\n", + " ,\n", + " ,\n", + " ],\n", + " [,\n", + " ,\n", + " ,\n", + " ,\n", + " ,\n", + " ,\n", + " ,\n", + " ,\n", + " ,\n", + " ,\n", + " ,\n", + " ,\n", + " ],\n", + " [,\n", + " ,\n", + " ,\n", + " ,\n", + " ,\n", + " ,\n", + " ,\n", + " ,\n", + " ,\n", + " ,\n", + " ,\n", + " ,\n", + " ],\n", + " [,\n", + " ,\n", + " ,\n", + " ,\n", + " ,\n", + " ,\n", + " ,\n", + " ,\n", + " ,\n", + " ,\n", + " ,\n", + " ,\n", + " ],\n", + " [,\n", + " ,\n", + " ,\n", + " ,\n", + " ,\n", + " ,\n", + " ,\n", + " ,\n", + " ,\n", + " ,\n", + " ,\n", + " ,\n", + " ],\n", + " [,\n", + " ,\n", + " ,\n", + " ,\n", + " ,\n", + " ,\n", + " ,\n", + " 
,\n", + " ,\n", + " ,\n", + " ,\n", + " ,\n", + " ],\n", + " [,\n", + " ,\n", + " ,\n", + " ,\n", + " ,\n", + " ,\n", + " ,\n", + " ,\n", + " ,\n", + " ,\n", + " ,\n", + " ,\n", + " ],\n", + " [,\n", + " ,\n", + " ,\n", + " ,\n", + " ,\n", + " ,\n", + " ,\n", + " ,\n", + " ,\n", + " ,\n", + " ,\n", + " ,\n", + " ],\n", + " [,\n", + " ,\n", + " ,\n", + " ,\n", + " ,\n", + " ,\n", + " ,\n", + " ,\n", + " ,\n", + " ,\n", + " ,\n", + " ,\n", + " ],\n", + " [,\n", + " ,\n", + " ,\n", + " ,\n", + " ,\n", + " ,\n", + " ,\n", + " ,\n", + " ,\n", + " ,\n", + " ,\n", + " ,\n", + " ],\n", + " [,\n", + " ,\n", + " ,\n", + " ,\n", + " ,\n", + " ,\n", + " ,\n", + " ,\n", + " ,\n", + " ,\n", + " ,\n", + " ,\n", + " ],\n", + " [,\n", + " ,\n", + " ,\n", + " ,\n", + " ,\n", + " ,\n", + " ,\n", + " ,\n", + " ,\n", + " ,\n", + " ,\n", + " ,\n", + " ],\n", + " [,\n", + " ,\n", + " ,\n", + " ,\n", + " ,\n", + " ,\n", + " ,\n", + " ,\n", + " ,\n", + " ,\n", + " ,\n", + " ,\n", + " ]]" + ] + }, + "metadata": {}, + "execution_count": 11 + } + ] + }, + { + "cell_type": "code", + "source": [ + "tables[0].data # entrega cada fila de la tabla en formato lista " + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "rhjmfd4tkmgc", + "outputId": "a30d8fc2-cebb-4bb0-ca23-31c457259ecb" + }, + "execution_count": 12, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "[['#',\n", + " 'Año JEC',\n", + " 'Código \\nmodular',\n", + " 'Código \\nlocal',\n", + " 'Nombre de la IE',\n", + " 'DRE',\n", + " 'UGEL',\n", + " 'Ubigeo',\n", + " 'Departamento',\n", + " 'Provincia',\n", + " 'Distrito',\n", + " 'Dirección',\n", + " 'Área Geográfica'],\n", + " ['1073',\n", + " '2015',\n", + " '1214410',\n", + " '234382',\n", + " 'SANTA ROSA',\n", + " 'DRE JUNIN',\n", + " 'UGEL \\nCHANCHAMAYO',\n", + " '120305',\n", + " 'JUNIN',\n", + " 'CHANCHAMAYO',\n", + " 'SAN RAMON',\n", + " 'AVENIDA 06 DE AGOSTO S/N',\n", + " 'URBANA'],\n", + " ['1078',\n", + " 
'2015',\n", + " '0692814',\n", + " '230077',\n", + " 'MARIA AUXILIADORA',\n", + " 'DRE JUNIN',\n", + " 'UGEL \\nCHANCHAMAYO',\n", + " '120301',\n", + " 'JUNIN',\n", + " 'CHANCHAMAYO',\n", + " 'CHANCHAMAYO',\n", + " 'CALLE SAN CARLOS S/N',\n", + " 'URBANA'],\n", + " ['1080',\n", + " '2015',\n", + " '0738799',\n", + " '232509',\n", + " 'PUCHARINI',\n", + " 'DRE JUNIN',\n", + " 'UGEL \\nCHANCHAMAYO',\n", + " '120302',\n", + " 'JUNIN',\n", + " 'CHANCHAMAYO',\n", + " 'PERENE',\n", + " 'CARRETERA MARGINAL KM \\n53 S/N',\n", + " 'URBANA'],\n", + " ['1081',\n", + " '2015',\n", + " '0599175',\n", + " '230058',\n", + " 'NUESTRA SEÑORA DE LAS \\nMERCEDES',\n", + " 'DRE JUNIN',\n", + " 'UGEL \\nCHANCHAMAYO',\n", + " '120301',\n", + " 'JUNIN',\n", + " 'CHANCHAMAYO',\n", + " 'CHANCHAMAYO',\n", + " 'JIRON JOSE DE SAN MARTIN \\nS/N',\n", + " 'URBANA'],\n", + " ['1082',\n", + " '2015',\n", + " '0373290',\n", + " '230906',\n", + " 'Perené',\n", + " 'DRE JUNIN',\n", + " 'UGEL \\nCHANCHAMAYO',\n", + " '120302',\n", + " 'JUNIN',\n", + " 'CHANCHAMAYO',\n", + " 'PERENÉ',\n", + " 'JIRON LOS CAFETOS PAMPA \\nSILVA S/N',\n", + " 'URBANA'],\n", + " ['1085',\n", + " '2015',\n", + " '0525428',\n", + " '229917',\n", + " 'AUGUSTO SALAZAR BONDY',\n", + " 'DRE JUNIN',\n", + " 'UGEL CHUPACA',\n", + " '120214',\n", + " 'JUNIN',\n", + " 'CONCEPCION',\n", + " 'SAN JOSE DE \\nQUERO',\n", + " 'AVENIDA CONCEPCION S/N',\n", + " 'URBANA'],\n", + " ['1086',\n", + " '2015',\n", + " '0590919',\n", + " '249109',\n", + " 'HEROES DE LA BREÑA',\n", + " 'DRE JUNIN',\n", + " 'UGEL CHUPACA',\n", + " '120905',\n", + " 'JUNIN',\n", + " 'CHUPACA',\n", + " 'HUAMANCACA \\nCHICO',\n", + " 'AVENIDA MARIA PARADO DE \\nBELLIDO S/N',\n", + " 'URBANA'],\n", + " ['1087',\n", + " '2015',\n", + " '0372912',\n", + " '248850',\n", + " 'AMAUTA',\n", + " 'DRE JUNIN',\n", + " 'UGEL CHUPACA',\n", + " '120902',\n", + " 'JUNIN',\n", + " 'CHUPACA',\n", + " 'AHUAC',\n", + " 'AVENIDA MARISCAL CACERES \\n1007',\n", + " 'URBANA'],\n", + " 
['1088',\n", + " '2015',\n", + " '0372961',\n", + " '248930',\n", + " 'SANTIAGO LEON',\n", + " 'DRE JUNIN',\n", + " 'UGEL CHUPACA',\n", + " '120903',\n", + " 'JUNIN',\n", + " 'CHUPACA',\n", + " 'CHONGOS BAJO',\n", + " 'CALLE LA MAR S/N',\n", + " 'RURAL'],\n", + " ['1090',\n", + " '2015',\n", + " '0580290',\n", + " '249492',\n", + " 'CAHUIDE',\n", + " 'DRE JUNIN',\n", + " 'UGEL CHUPACA',\n", + " '120909',\n", + " 'JUNIN',\n", + " 'CHUPACA',\n", + " 'YANACANCHA',\n", + " 'CARRETERA PRINCIPAL LAIVE \\nS/N',\n", + " 'URBANA'],\n", + " ['1093',\n", + " '2015',\n", + " '1099852',\n", + " '227555',\n", + " '31511 LORENZO ALCALA POMALAZA',\n", + " 'DRE JUNIN',\n", + " 'UGEL CONCEPCION',\n", + " '120201',\n", + " 'JUNIN',\n", + " 'CONCEPCION',\n", + " 'CONCEPCION',\n", + " 'AVENIDA AGRICULTURA 542',\n", + " 'URBANA'],\n", + " ['1094',\n", + " '2015',\n", + " '0372730',\n", + " '227490',\n", + " 'HEROINAS TOLEDO',\n", + " 'DRE JUNIN',\n", + " 'UGEL CONCEPCION',\n", + " '120201',\n", + " 'JUNIN',\n", + " 'CONCEPCION',\n", + " 'CONCEPCION',\n", + " 'AVENIDA AGRICULTURA 596-\\n598',\n", + " 'URBANA'],\n", + " ['1095',\n", + " '2015',\n", + " '0373027',\n", + " '229012',\n", + " 'APU INCA',\n", + " 'DRE JUNIN',\n", + " 'UGEL CONCEPCION',\n", + " '120206',\n", + " 'JUNIN',\n", + " 'CONCEPCION',\n", + " 'COMAS',\n", + " 'CALLE ESTADIO MUNICIPAL \\nS/N',\n", + " 'URBANA'],\n", + " ['1100',\n", + " '2015',\n", + " '0372946',\n", + " '224608',\n", + " 'JOSE OLAYA',\n", + " 'DRE JUNIN',\n", + " 'UGEL HUANCAYO',\n", + " '120117',\n", + " 'JUNIN',\n", + " 'HUANCAYO',\n", + " 'HUALHUAS',\n", + " 'AVENIDA ALFONSO UGARTE \\n944-970',\n", + " 'URBANA'],\n", + " ['1101',\n", + " '2015',\n", + " '0667022',\n", + " '224383',\n", + " 'SALESIANO DON BOSCO',\n", + " 'DRE JUNIN',\n", + " 'UGEL HUANCAYO',\n", + " '120114',\n", + " 'JUNIN',\n", + " 'HUANCAYO',\n", + " 'EL TAMBO',\n", + " 'AVENIDA HUANCAVELICA \\n165',\n", + " 'URBANA'],\n", + " ['1103',\n", + " '2015',\n", + " '0372870',\n", + " 
'226551',\n", + " 'CHINCHAYSUYO',\n", + " 'DRE JUNIN',\n", + " 'UGEL HUANCAYO',\n", + " '120133',\n", + " 'JUNIN',\n", + " 'HUANCAYO',\n", + " 'SAPALLANGA',\n", + " 'AVENIDA PEÐALOZA S/N',\n", + " 'RURAL']]" + ] + }, + "metadata": {}, + "execution_count": 12 + } + ] + }, + { + "cell_type": "code", + "source": [ + "data = pd.concat(tables, ignore_index=True)" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 310 + }, + "id": "qzjescS-kUhJ", + "outputId": "c4598f0d-b235-4e7b-c90d-25c50cac0e5f" + }, + "execution_count": 14, + "outputs": [ + { + "output_type": "error", + "ename": "TypeError", + "evalue": "ignored", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mTypeError\u001b[0m Traceback (most recent call last)", + "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mdata\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mpd\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mconcat\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtables\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mignore_index\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mTrue\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", + "\u001b[0;32m/usr/local/lib/python3.10/dist-packages/pandas/util/_decorators.py\u001b[0m in \u001b[0;36mwrapper\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 329\u001b[0m \u001b[0mstacklevel\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mfind_stack_level\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 330\u001b[0m )\n\u001b[0;32m--> 331\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mfunc\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m 
\u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 332\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 333\u001b[0m \u001b[0;31m# error: \"Callable[[VarArg(Any), KwArg(Any)], Any]\" has no\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/usr/local/lib/python3.10/dist-packages/pandas/core/reshape/concat.py\u001b[0m in \u001b[0;36mconcat\u001b[0;34m(objs, axis, join, ignore_index, keys, levels, names, verify_integrity, sort, copy)\u001b[0m\n\u001b[1;32m 366\u001b[0m \u001b[0;36m1\u001b[0m \u001b[0;36m3\u001b[0m \u001b[0;36m4\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 367\u001b[0m \"\"\"\n\u001b[0;32m--> 368\u001b[0;31m op = _Concatenator(\n\u001b[0m\u001b[1;32m 369\u001b[0m \u001b[0mobjs\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 370\u001b[0m \u001b[0maxis\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0maxis\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/usr/local/lib/python3.10/dist-packages/pandas/core/reshape/concat.py\u001b[0m in \u001b[0;36m__init__\u001b[0;34m(self, objs, axis, join, keys, levels, names, ignore_index, verify_integrity, copy, sort)\u001b[0m\n\u001b[1;32m 456\u001b[0m \u001b[0;34m\"only Series and DataFrame objs are valid\"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 457\u001b[0m )\n\u001b[0;32m--> 458\u001b[0;31m \u001b[0;32mraise\u001b[0m \u001b[0mTypeError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mmsg\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 459\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 460\u001b[0m 
\u001b[0mndims\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0madd\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mobj\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mndim\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;31mTypeError\u001b[0m: cannot concatenate object of type ''; only Series and DataFrame objs are valid" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "\n", + "# Crea un solo DataFrame combinando todas las tablas\n", + "data_frames = [table.df for table in tables]\n", + "data = pd.concat(data_frames, ignore_index=True)" + ], + "metadata": { + "id": "JjFfbw1LmIKT" + }, + "execution_count": 15, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "print(data)" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "5BuTw6kxmKMy", + "outputId": "3d5cf343-29dd-464d-d5c9-552d44e166b9" + }, + "execution_count": 16, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + " 0 1 2 3 \\\n", + "0 # Año JEC Código \\nmodular Código \\nlocal \n", + "1 1073 2015 1214410 234382 \n", + "2 1078 2015 0692814 230077 \n", + "3 1080 2015 0738799 232509 \n", + "4 1081 2015 0599175 230058 \n", + "5 1082 2015 0373290 230906 \n", + "6 1085 2015 0525428 229917 \n", + "7 1086 2015 0590919 249109 \n", + "8 1087 2015 0372912 248850 \n", + "9 1088 2015 0372961 248930 \n", + "10 1090 2015 0580290 249492 \n", + "11 1093 2015 1099852 227555 \n", + "12 1094 2015 0372730 227490 \n", + "13 1095 2015 0373027 229012 \n", + "14 1100 2015 0372946 224608 \n", + "15 1101 2015 0667022 224383 \n", + "16 1103 2015 0372870 226551 \n", + "\n", + " 4 5 6 7 \\\n", + "0 Nombre de la IE DRE UGEL Ubigeo \n", + "1 SANTA ROSA DRE JUNIN UGEL \\nCHANCHAMAYO 120305 \n", + "2 MARIA AUXILIADORA DRE JUNIN UGEL \\nCHANCHAMAYO 120301 \n", + "3 PUCHARINI DRE JUNIN UGEL \\nCHANCHAMAYO 120302 \n", + "4 NUESTRA SEÑORA DE LAS \\nMERCEDES DRE JUNIN UGEL \\nCHANCHAMAYO 120301 \n", + "5 Perené DRE JUNIN UGEL 
\\nCHANCHAMAYO 120302 \n", + "6 AUGUSTO SALAZAR BONDY DRE JUNIN UGEL CHUPACA 120214 \n", + "7 HEROES DE LA BREÑA DRE JUNIN UGEL CHUPACA 120905 \n", + "8 AMAUTA DRE JUNIN UGEL CHUPACA 120902 \n", + "9 SANTIAGO LEON DRE JUNIN UGEL CHUPACA 120903 \n", + "10 CAHUIDE DRE JUNIN UGEL CHUPACA 120909 \n", + "11 31511 LORENZO ALCALA POMALAZA DRE JUNIN UGEL CONCEPCION 120201 \n", + "12 HEROINAS TOLEDO DRE JUNIN UGEL CONCEPCION 120201 \n", + "13 APU INCA DRE JUNIN UGEL CONCEPCION 120206 \n", + "14 JOSE OLAYA DRE JUNIN UGEL HUANCAYO 120117 \n", + "15 SALESIANO DON BOSCO DRE JUNIN UGEL HUANCAYO 120114 \n", + "16 CHINCHAYSUYO DRE JUNIN UGEL HUANCAYO 120133 \n", + "\n", + " 8 9 10 \\\n", + "0 Departamento Provincia Distrito \n", + "1 JUNIN CHANCHAMAYO SAN RAMON \n", + "2 JUNIN CHANCHAMAYO CHANCHAMAYO \n", + "3 JUNIN CHANCHAMAYO PERENE \n", + "4 JUNIN CHANCHAMAYO CHANCHAMAYO \n", + "5 JUNIN CHANCHAMAYO PERENÉ \n", + "6 JUNIN CONCEPCION SAN JOSE DE \\nQUERO \n", + "7 JUNIN CHUPACA HUAMANCACA \\nCHICO \n", + "8 JUNIN CHUPACA AHUAC \n", + "9 JUNIN CHUPACA CHONGOS BAJO \n", + "10 JUNIN CHUPACA YANACANCHA \n", + "11 JUNIN CONCEPCION CONCEPCION \n", + "12 JUNIN CONCEPCION CONCEPCION \n", + "13 JUNIN CONCEPCION COMAS \n", + "14 JUNIN HUANCAYO HUALHUAS \n", + "15 JUNIN HUANCAYO EL TAMBO \n", + "16 JUNIN HUANCAYO SAPALLANGA \n", + "\n", + " 11 12 \n", + "0 Dirección Área Geográfica \n", + "1 AVENIDA 06 DE AGOSTO S/N URBANA \n", + "2 CALLE SAN CARLOS S/N URBANA \n", + "3 CARRETERA MARGINAL KM \\n53 S/N URBANA \n", + "4 JIRON JOSE DE SAN MARTIN \\nS/N URBANA \n", + "5 JIRON LOS CAFETOS PAMPA \\nSILVA S/N URBANA \n", + "6 AVENIDA CONCEPCION S/N URBANA \n", + "7 AVENIDA MARIA PARADO DE \\nBELLIDO S/N URBANA \n", + "8 AVENIDA MARISCAL CACERES \\n1007 URBANA \n", + "9 CALLE LA MAR S/N RURAL \n", + "10 CARRETERA PRINCIPAL LAIVE \\nS/N URBANA \n", + "11 AVENIDA AGRICULTURA 542 URBANA \n", + "12 AVENIDA AGRICULTURA 596-\\n598 URBANA \n", + "13 CALLE ESTADIO MUNICIPAL \\nS/N URBANA \n", + "14 AVENIDA 
ALFONSO UGARTE \\n944-970 URBANA \n", + "15 AVENIDA HUANCAVELICA \\n165 URBANA \n", + "16 AVENIDA PEÐALOZA S/N RURAL \n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "data" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 582 + }, + "id": "xzaSSCXcmNTC", + "outputId": "907cbbd5-e99f-4805-ac24-9af0bac9cee7" + }, + "execution_count": 17, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + " 0 1 2 3 \\\n", + "0 # Año JEC Código \\nmodular Código \\nlocal \n", + "1 1073 2015 1214410 234382 \n", + "2 1078 2015 0692814 230077 \n", + "3 1080 2015 0738799 232509 \n", + "4 1081 2015 0599175 230058 \n", + "5 1082 2015 0373290 230906 \n", + "6 1085 2015 0525428 229917 \n", + "7 1086 2015 0590919 249109 \n", + "8 1087 2015 0372912 248850 \n", + "9 1088 2015 0372961 248930 \n", + "10 1090 2015 0580290 249492 \n", + "11 1093 2015 1099852 227555 \n", + "12 1094 2015 0372730 227490 \n", + "13 1095 2015 0373027 229012 \n", + "14 1100 2015 0372946 224608 \n", + "15 1101 2015 0667022 224383 \n", + "16 1103 2015 0372870 226551 \n", + "\n", + " 4 5 6 7 \\\n", + "0 Nombre de la IE DRE UGEL Ubigeo \n", + "1 SANTA ROSA DRE JUNIN UGEL \\nCHANCHAMAYO 120305 \n", + "2 MARIA AUXILIADORA DRE JUNIN UGEL \\nCHANCHAMAYO 120301 \n", + "3 PUCHARINI DRE JUNIN UGEL \\nCHANCHAMAYO 120302 \n", + "4 NUESTRA SEÑORA DE LAS \\nMERCEDES DRE JUNIN UGEL \\nCHANCHAMAYO 120301 \n", + "5 Perené DRE JUNIN UGEL \\nCHANCHAMAYO 120302 \n", + "6 AUGUSTO SALAZAR BONDY DRE JUNIN UGEL CHUPACA 120214 \n", + "7 HEROES DE LA BREÑA DRE JUNIN UGEL CHUPACA 120905 \n", + "8 AMAUTA DRE JUNIN UGEL CHUPACA 120902 \n", + "9 SANTIAGO LEON DRE JUNIN UGEL CHUPACA 120903 \n", + "10 CAHUIDE DRE JUNIN UGEL CHUPACA 120909 \n", + "11 31511 LORENZO ALCALA POMALAZA DRE JUNIN UGEL CONCEPCION 120201 \n", + "12 HEROINAS TOLEDO DRE JUNIN UGEL CONCEPCION 120201 \n", + "13 APU INCA DRE JUNIN UGEL CONCEPCION 120206 \n", + "14 JOSE OLAYA DRE JUNIN UGEL HUANCAYO 
120117 \n", + "15 SALESIANO DON BOSCO DRE JUNIN UGEL HUANCAYO 120114 \n", + "16 CHINCHAYSUYO DRE JUNIN UGEL HUANCAYO 120133 \n", + "\n", + " 8 9 10 \\\n", + "0 Departamento Provincia Distrito \n", + "1 JUNIN CHANCHAMAYO SAN RAMON \n", + "2 JUNIN CHANCHAMAYO CHANCHAMAYO \n", + "3 JUNIN CHANCHAMAYO PERENE \n", + "4 JUNIN CHANCHAMAYO CHANCHAMAYO \n", + "5 JUNIN CHANCHAMAYO PERENÉ \n", + "6 JUNIN CONCEPCION SAN JOSE DE \\nQUERO \n", + "7 JUNIN CHUPACA HUAMANCACA \\nCHICO \n", + "8 JUNIN CHUPACA AHUAC \n", + "9 JUNIN CHUPACA CHONGOS BAJO \n", + "10 JUNIN CHUPACA YANACANCHA \n", + "11 JUNIN CONCEPCION CONCEPCION \n", + "12 JUNIN CONCEPCION CONCEPCION \n", + "13 JUNIN CONCEPCION COMAS \n", + "14 JUNIN HUANCAYO HUALHUAS \n", + "15 JUNIN HUANCAYO EL TAMBO \n", + "16 JUNIN HUANCAYO SAPALLANGA \n", + "\n", + " 11 12 \n", + "0 Dirección Área Geográfica \n", + "1 AVENIDA 06 DE AGOSTO S/N URBANA \n", + "2 CALLE SAN CARLOS S/N URBANA \n", + "3 CARRETERA MARGINAL KM \\n53 S/N URBANA \n", + "4 JIRON JOSE DE SAN MARTIN \\nS/N URBANA \n", + "5 JIRON LOS CAFETOS PAMPA \\nSILVA S/N URBANA \n", + "6 AVENIDA CONCEPCION S/N URBANA \n", + "7 AVENIDA MARIA PARADO DE \\nBELLIDO S/N URBANA \n", + "8 AVENIDA MARISCAL CACERES \\n1007 URBANA \n", + "9 CALLE LA MAR S/N RURAL \n", + "10 CARRETERA PRINCIPAL LAIVE \\nS/N URBANA \n", + "11 AVENIDA AGRICULTURA 542 URBANA \n", + "12 AVENIDA AGRICULTURA 596-\\n598 URBANA \n", + "13 CALLE ESTADIO MUNICIPAL \\nS/N URBANA \n", + "14 AVENIDA ALFONSO UGARTE \\n944-970 URBANA \n", + "15 AVENIDA HUANCAVELICA \\n165 URBANA \n", + "16 AVENIDA PEÐALOZA S/N RURAL " + ], + "text/html": [ + "\n", + "
\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
0123456789101112
0#Año JECCódigo \\nmodularCódigo \\nlocalNombre de la IEDREUGELUbigeoDepartamentoProvinciaDistritoDirecciónÁrea Geográfica
1107320151214410234382SANTA ROSADRE JUNINUGEL \\nCHANCHAMAYO120305JUNINCHANCHAMAYOSAN RAMONAVENIDA 06 DE AGOSTO S/NURBANA
2107820150692814230077MARIA AUXILIADORADRE JUNINUGEL \\nCHANCHAMAYO120301JUNINCHANCHAMAYOCHANCHAMAYOCALLE SAN CARLOS S/NURBANA
3108020150738799232509PUCHARINIDRE JUNINUGEL \\nCHANCHAMAYO120302JUNINCHANCHAMAYOPERENECARRETERA MARGINAL KM \\n53 S/NURBANA
4108120150599175230058NUESTRA SEÑORA DE LAS \\nMERCEDESDRE JUNINUGEL \\nCHANCHAMAYO120301JUNINCHANCHAMAYOCHANCHAMAYOJIRON JOSE DE SAN MARTIN \\nS/NURBANA
5108220150373290230906PerenéDRE JUNINUGEL \\nCHANCHAMAYO120302JUNINCHANCHAMAYOPERENÉJIRON LOS CAFETOS PAMPA \\nSILVA S/NURBANA
6108520150525428229917AUGUSTO SALAZAR BONDYDRE JUNINUGEL CHUPACA120214JUNINCONCEPCIONSAN JOSE DE \\nQUEROAVENIDA CONCEPCION S/NURBANA
7108620150590919249109HEROES DE LA BREÑADRE JUNINUGEL CHUPACA120905JUNINCHUPACAHUAMANCACA \\nCHICOAVENIDA MARIA PARADO DE \\nBELLIDO S/NURBANA
8108720150372912248850AMAUTADRE JUNINUGEL CHUPACA120902JUNINCHUPACAAHUACAVENIDA MARISCAL CACERES \\n1007URBANA
9108820150372961248930SANTIAGO LEONDRE JUNINUGEL CHUPACA120903JUNINCHUPACACHONGOS BAJOCALLE LA MAR S/NRURAL
10109020150580290249492CAHUIDEDRE JUNINUGEL CHUPACA120909JUNINCHUPACAYANACANCHACARRETERA PRINCIPAL LAIVE \\nS/NURBANA
1110932015109985222755531511 LORENZO ALCALA POMALAZADRE JUNINUGEL CONCEPCION120201JUNINCONCEPCIONCONCEPCIONAVENIDA AGRICULTURA 542URBANA
12109420150372730227490HEROINAS TOLEDODRE JUNINUGEL CONCEPCION120201JUNINCONCEPCIONCONCEPCIONAVENIDA AGRICULTURA 596-\\n598URBANA
13109520150373027229012APU INCADRE JUNINUGEL CONCEPCION120206JUNINCONCEPCIONCOMASCALLE ESTADIO MUNICIPAL \\nS/NURBANA
14110020150372946224608JOSE OLAYADRE JUNINUGEL HUANCAYO120117JUNINHUANCAYOHUALHUASAVENIDA ALFONSO UGARTE \\n944-970URBANA
15110120150667022224383SALESIANO DON BOSCODRE JUNINUGEL HUANCAYO120114JUNINHUANCAYOEL TAMBOAVENIDA HUANCAVELICA \\n165URBANA
16110320150372870226551CHINCHAYSUYODRE JUNINUGEL HUANCAYO120133JUNINHUANCAYOSAPALLANGAAVENIDA PEÐALOZA S/NRURAL
\n", + "
\n", + " \n", + " \n", + " \n", + "\n", + " \n", + "
\n", + "
\n", + " " + ] + }, + "metadata": {}, + "execution_count": 17 + } + ] + }, + { + "cell_type": "code", + "source": [ + "from google.colab import files" + ], + "metadata": { + "id": "LH3goXVIn41V" + }, + "execution_count": 22, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "# Exportar el DataFrame a formato .dta (Stata)\n", + "data.to_stata(\"jec_data_Grupo3.dta\")\n", + "\n", + "# Descargar el archivo .dta\n", + "files.download(\"jec_data_Grupo3.dta\")" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 416 + }, + "id": "eYFDa0A6mlJ9", + "outputId": "ab267151-21ea-4b98-fbfd-849e4cce5583" + }, + "execution_count": 24, + "outputs": [ + { + "output_type": "stream", + "name": "stderr", + "text": [ + ":2: InvalidColumnName: \n", + "Not all pandas column names were valid Stata variable names.\n", + "The following replacements have been made:\n", + "\n", + " 0 -> _0\n", + " 1 -> _1\n", + " 2 -> _2\n", + " 3 -> _3\n", + " 4 -> _4\n", + " 5 -> _5\n", + " 6 -> _6\n", + " 7 -> _7\n", + " 8 -> _8\n", + " 9 -> _9\n", + " 10 -> _10\n", + " 11 -> _11\n", + " 12 -> _12\n", + "\n", + "If this is not what you expect, please make sure you have Stata-compliant\n", + "column names in your DataFrame (strings only, max 32 characters, only\n", + "alphanumerics and underscores, no Stata reserved words)\n", + "\n", + " data.to_stata(\"jec_data_Grupo3.dta\")\n" + ] + }, + { + "output_type": "display_data", + "data": { + "text/plain": [ + "" + ], + "application/javascript": [ + "\n", + " async function download(id, filename, size) {\n", + " if (!google.colab.kernel.accessAllowed) {\n", + " return;\n", + " }\n", + " const div = document.createElement('div');\n", + " const label = document.createElement('label');\n", + " label.textContent = `Downloading \"${filename}\": `;\n", + " div.appendChild(label);\n", + " const progress = document.createElement('progress');\n", + " progress.max = size;\n", + " div.appendChild(progress);\n", + " 
document.body.appendChild(div);\n", + "\n", + " const buffers = [];\n", + " let downloaded = 0;\n", + "\n", + " const channel = await google.colab.kernel.comms.open(id);\n", + " // Send a message to notify the kernel that we're ready.\n", + " channel.send({})\n", + "\n", + " for await (const message of channel.messages) {\n", + " // Send a message to notify the kernel that we're ready.\n", + " channel.send({})\n", + " if (message.buffers) {\n", + " for (const buffer of message.buffers) {\n", + " buffers.push(buffer);\n", + " downloaded += buffer.byteLength;\n", + " progress.value = downloaded;\n", + " }\n", + " }\n", + " }\n", + " const blob = new Blob(buffers, {type: 'application/binary'});\n", + " const a = document.createElement('a');\n", + " a.href = window.URL.createObjectURL(blob);\n", + " a.download = filename;\n", + " div.appendChild(a);\n", + " a.click();\n", + " div.remove();\n", + " }\n", + " " + ] + }, + "metadata": {} + }, + { + "output_type": "display_data", + "data": { + "text/plain": [ + "" + ], + "application/javascript": [ + "download(\"download_74d964e6-d0d1-4840-9368-412f805a6513\", \"jec_data_Grupo3.dta\", 6268)" + ] + }, + "metadata": {} + } + ] + } + ] +} \ No newline at end of file diff --git a/Labs/tarea6/GRUPO_3_RShape.ipynb b/Labs/tarea6/GRUPO_3_RShape.ipynb new file mode 100644 index 0000000..c8e0d1e --- /dev/null +++ b/Labs/tarea6/GRUPO_3_RShape.ipynb @@ -0,0 +1,1157 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "87bb4297", + "metadata": {}, + "source": [ + "#### Tarea 6 - Grupo 3\n", + "#### Integrantes\n", + "* Narumi Miyamoto \n", + "* Rodrigo Cervera\n", + "* Alicia Chaquila" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "4137dd69", + "metadata": {}, + "outputs": [], + "source": [ + "#Exportamos las librerías que necesitaremos\n", + "import pandas as pd\n", + "import numpy as np\n", + "import re \n", + "import warnings\n", + "warnings.filterwarnings('ignore') " + ] + }, + { + "cell_type": "code", + 
"execution_count": 10, + "id": "2ef139e0", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
numpernumpanh15numpanh16numpanh17numpanh18numpanh19mes_15ubigeo_15dominio_15p400a3_15...mes_18ubigeo_18dominio_18p400a3_18p4022_18mes_19ubigeo_19dominio_19p400a3_19p4022_19
015368115811581158115811581110101sierra norte1946.0...1210101sierra norte1946.00.01110101sierra norte1946.00.0
115369115811581158115811581110101sierra norte1973.0...1210101sierra norte1973.00.01110101sierra norte1973.00.0
215380116211621162116211621110101sierra norte1987.0...1210101sierra norte1987.00.01110101sierra norte1987.01.0
315381116211621162116211621110101sierra norte2009.0...1210101sierra norte2009.00.01110101sierra norte2009.00.0
41541011851185118511851185710101sierra norte1955.0...710101sierra norte1955.00.0710101sierra norte1955.01.0
..................................................................
554134848185491854918549185491854912250101selva2002.0...12250101selva2002.00.010250101selva2002.01.0
554234846185491854918549185491854912250101selva1985.0...12250101selva1985.00.010250101selva1985.00.0
554334847185491854918549185491854912250101selva1976.0...12250101selva1976.00.010250101selva1976.01.0
554434849185491854918549185491854912250101selva2007.0...12250101selva2007.00.010250101selva2007.01.0
554534850185491854918549185491854912250101selva2011.0...12250101selva2011.01.010250101selva2011.00.0
\n", + "

5546 rows × 31 columns

\n", + "
" + ], + "text/plain": [ + " numper numpanh15 numpanh16 numpanh17 numpanh18 numpanh19 mes_15 \\\n", + "0 15368 1158 1158 1158 1158 1158 11 \n", + "1 15369 1158 1158 1158 1158 1158 11 \n", + "2 15380 1162 1162 1162 1162 1162 11 \n", + "3 15381 1162 1162 1162 1162 1162 11 \n", + "4 15410 1185 1185 1185 1185 1185 7 \n", + "... ... ... ... ... ... ... ... \n", + "5541 34848 18549 18549 18549 18549 18549 12 \n", + "5542 34846 18549 18549 18549 18549 18549 12 \n", + "5543 34847 18549 18549 18549 18549 18549 12 \n", + "5544 34849 18549 18549 18549 18549 18549 12 \n", + "5545 34850 18549 18549 18549 18549 18549 12 \n", + "\n", + " ubigeo_15 dominio_15 p400a3_15 ... mes_18 ubigeo_18 \\\n", + "0 10101 sierra norte 1946.0 ... 12 10101 \n", + "1 10101 sierra norte 1973.0 ... 12 10101 \n", + "2 10101 sierra norte 1987.0 ... 12 10101 \n", + "3 10101 sierra norte 2009.0 ... 12 10101 \n", + "4 10101 sierra norte 1955.0 ... 7 10101 \n", + "... ... ... ... ... ... ... \n", + "5541 250101 selva 2002.0 ... 12 250101 \n", + "5542 250101 selva 1985.0 ... 12 250101 \n", + "5543 250101 selva 1976.0 ... 12 250101 \n", + "5544 250101 selva 2007.0 ... 12 250101 \n", + "5545 250101 selva 2011.0 ... 12 250101 \n", + "\n", + " dominio_18 p400a3_18 p4022_18 mes_19 ubigeo_19 dominio_19 \\\n", + "0 sierra norte 1946.0 0.0 11 10101 sierra norte \n", + "1 sierra norte 1973.0 0.0 11 10101 sierra norte \n", + "2 sierra norte 1987.0 0.0 11 10101 sierra norte \n", + "3 sierra norte 2009.0 0.0 11 10101 sierra norte \n", + "4 sierra norte 1955.0 0.0 7 10101 sierra norte \n", + "... ... ... ... ... ... ... \n", + "5541 selva 2002.0 0.0 10 250101 selva \n", + "5542 selva 1985.0 0.0 10 250101 selva \n", + "5543 selva 1976.0 0.0 10 250101 selva \n", + "5544 selva 2007.0 0.0 10 250101 selva \n", + "5545 selva 2011.0 1.0 10 250101 selva \n", + "\n", + " p400a3_19 p4022_19 \n", + "0 1946.0 0.0 \n", + "1 1973.0 0.0 \n", + "2 1987.0 1.0 \n", + "3 2009.0 0.0 \n", + "4 1955.0 1.0 \n", + "... ... ... 
\n", + "5541 2002.0 1.0 \n", + "5542 1985.0 0.0 \n", + "5543 1976.0 1.0 \n", + "5544 2007.0 1.0 \n", + "5545 2011.0 0.0 \n", + "\n", + "[5546 rows x 31 columns]" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#Leemos la base de datos\n", + "# convert_caterials : se usará para especificar si se deben convertir las variables categóricas del archivo Stata a variables categóricas de Pandas\n", + "panel = pd.read_stata(\"C:/Users/ALICIA/Documents/GitHub/ultima tarea/data/data.dta\",convert_categoricals=False) \n", + "panel" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "cce469c2", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Index(['numper', 'numpanh15', 'numpanh16', 'numpanh17', 'numpanh18',\n", + " 'numpanh19', 'mes_15', 'ubigeo_15', 'dominio_15', 'p400a3_15',\n", + " 'p4022_15', 'mes_16', 'ubigeo_16', 'dominio_16', 'p400a3_16',\n", + " 'p4022_16', 'mes_17', 'ubigeo_17', 'dominio_17', 'p400a3_17',\n", + " 'p4022_17', 'mes_18', 'ubigeo_18', 'dominio_18', 'p400a3_18',\n", + " 'p4022_18', 'mes_19', 'ubigeo_19', 'dominio_19', 'p400a3_19',\n", + " 'p4022_19'],\n", + " dtype='object')" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Nos devuelve una lista de todos los nombres de las columnas \n", + "panel.columns[:]" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "9b9d749e", + "metadata": {}, + "outputs": [], + "source": [ + "# Renombramos a las columnas\n", + "panel.rename(columns = {'numpanh15':'numpanh_15', 'numpanh16':'numpanh_16','numpanh17':'numpanh_17','numpanh18':'numpanh_18','numpanh19':'numpanh_19'}, inplace = True)" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "f7412527", + "metadata": {}, + "outputs": [], + "source": [ + "# Crearemos una lista llamada filter_list que contiene los nombres de todas las columnas del DataFrame llamado \"panel\"\n", 
+ "filter_list = list(panel.columns)[:] " + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "7e38e765", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['numper',\n", + " 'numpanh_15',\n", + " 'numpanh_16',\n", + " 'numpanh_17',\n", + " 'numpanh_18',\n", + " 'numpanh_19',\n", + " 'mes_15',\n", + " 'ubigeo_15',\n", + " 'dominio_15',\n", + " 'p400a3_15',\n", + " 'p4022_15',\n", + " 'mes_16',\n", + " 'ubigeo_16',\n", + " 'dominio_16',\n", + " 'p400a3_16',\n", + " 'p4022_16',\n", + " 'mes_17',\n", + " 'ubigeo_17',\n", + " 'dominio_17',\n", + " 'p400a3_17',\n", + " 'p4022_17',\n", + " 'mes_18',\n", + " 'ubigeo_18',\n", + " 'dominio_18',\n", + " 'p400a3_18',\n", + " 'p4022_18',\n", + " 'mes_19',\n", + " 'ubigeo_19',\n", + " 'dominio_19',\n", + " 'p400a3_19',\n", + " 'p4022_19']" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Crearemos una nueva lista llamada new_list que contiene los elementos únicos de la lista filter_list, eliminando cualquier duplicado\n", + "new_list = list(dict.fromkeys(filter_list))\n", + "new_list" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "416a041f", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
numperperiodnumpanhmesubigeodominiop400a3p4022
0153681511581110101sierra norte1946.00.0
1153691511581110101sierra norte1973.00.0
2153801511621110101sierra norte1987.01.0
3153811511621110101sierra norte2009.01.0
415410151185710101sierra norte1955.00.0
...........................
2772534848191854910250101selva2002.01.0
2772634846191854910250101selva1985.00.0
2772734847191854910250101selva1976.01.0
2772834849191854910250101selva2007.01.0
2772934850191854910250101selva2011.00.0
\n", + "

27730 rows × 8 columns

\n", + "
" + ], + "text/plain": [ + " numper period numpanh mes ubigeo dominio p400a3 p4022\n", + "0 15368 15 1158 11 10101 sierra norte 1946.0 0.0\n", + "1 15369 15 1158 11 10101 sierra norte 1973.0 0.0\n", + "2 15380 15 1162 11 10101 sierra norte 1987.0 1.0\n", + "3 15381 15 1162 11 10101 sierra norte 2009.0 1.0\n", + "4 15410 15 1185 7 10101 sierra norte 1955.0 0.0\n", + "... ... ... ... ... ... ... ... ...\n", + "27725 34848 19 18549 10 250101 selva 2002.0 1.0\n", + "27726 34846 19 18549 10 250101 selva 1985.0 0.0\n", + "27727 34847 19 18549 10 250101 selva 1976.0 1.0\n", + "27728 34849 19 18549 10 250101 selva 2007.0 1.0\n", + "27729 34850 19 18549 10 250101 selva 2011.0 0.0\n", + "\n", + "[27730 rows x 8 columns]" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Transformaremos de los datos en un formato más largo (long format)\n", + "# reset_index : Lo utilizaremos para reiniciar el índice del DataFrame resultante\n", + "\n", + "reshape_panel = pd.wide_to_long(panel, stubnames = ['numpanh','mes','ubigeo','dominio','p400a3','p4022'], i = ['numper'] , \n", + " j = 'period' , sep = '_').reset_index()\n", + "reshape_panel" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "e9ab148b", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ubigeodistritoprovinciaregionunidos
010101ChachapoyasChachapoyasAmazonas1
110102AsuncionChachapoyasAmazonas1
210103BalsasChachapoyasAmazonas0
310104ChetoChachapoyasAmazonas1
410105ChiliquinChachapoyasAmazonas0
..................
1869250302IrazolaPadre AbadUcayali0
1870250303CurimanaPadre AbadUcayali0
1871250304NeshuyaPadre AbadUcayali0
1872250305Alexander von HumboldtPadre AbadUcayali0
1873250401PurusPurusUcayali1
\n", + "

1874 rows × 5 columns

\n", + "
" + ], + "text/plain": [ + " ubigeo distrito provincia region unidos\n", + "0 10101 Chachapoyas Chachapoyas Amazonas 1\n", + "1 10102 Asuncion Chachapoyas Amazonas 1\n", + "2 10103 Balsas Chachapoyas Amazonas 0\n", + "3 10104 Cheto Chachapoyas Amazonas 1\n", + "4 10105 Chiliquin Chachapoyas Amazonas 0\n", + "... ... ... ... ... ...\n", + "1869 250302 Irazola Padre Abad Ucayali 0\n", + "1870 250303 Curimana Padre Abad Ucayali 0\n", + "1871 250304 Neshuya Padre Abad Ucayali 0\n", + "1872 250305 Alexander von Humboldt Padre Abad Ucayali 0\n", + "1873 250401 Purus Purus Ucayali 1\n", + "\n", + "[1874 rows x 5 columns]" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Llamaremos a una nueva base de datos\n", + "unidos = pd.read_stata(\"C:/Users/ALICIA/Documents/GitHub/ultima tarea/data/unidos.dta\",\n", + " convert_categoricals=False)\n", + "unidos" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "a13973a0", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
numperperiodnumpanhmesubigeodominiop400a3p4022distritoprovinciaregionunidos
0153681511581110101sierra norte1946.00.0ChachapoyasChachapoyasAmazonas1
1153691511581110101sierra norte1973.00.0ChachapoyasChachapoyasAmazonas1
2153801511621110101sierra norte1987.01.0ChachapoyasChachapoyasAmazonas1
3153811511621110101sierra norte2009.01.0ChachapoyasChachapoyasAmazonas1
415410151185710101sierra norte1955.00.0ChachapoyasChachapoyasAmazonas1
.......................................
277253477719184727220301selva1942.00.0San Jose de SisaEl DoradoSan Martin0
277263477919184737220301selva1988.00.0San Jose de SisaEl DoradoSan Martin0
277273478119184737220301selva2015.00.0San Jose de SisaEl DoradoSan Martin0
277283477819184737220301selva1984.01.0San Jose de SisaEl DoradoSan Martin0
277293478019184737220301selva2010.00.0San Jose de SisaEl DoradoSan Martin0
\n", + "

27730 rows × 12 columns

\n", + "
" + ], + "text/plain": [ + " numper period numpanh mes ubigeo dominio p400a3 p4022 \\\n", + "0 15368 15 1158 11 10101 sierra norte 1946.0 0.0 \n", + "1 15369 15 1158 11 10101 sierra norte 1973.0 0.0 \n", + "2 15380 15 1162 11 10101 sierra norte 1987.0 1.0 \n", + "3 15381 15 1162 11 10101 sierra norte 2009.0 1.0 \n", + "4 15410 15 1185 7 10101 sierra norte 1955.0 0.0 \n", + "... ... ... ... ... ... ... ... ... \n", + "27725 34777 19 18472 7 220301 selva 1942.0 0.0 \n", + "27726 34779 19 18473 7 220301 selva 1988.0 0.0 \n", + "27727 34781 19 18473 7 220301 selva 2015.0 0.0 \n", + "27728 34778 19 18473 7 220301 selva 1984.0 1.0 \n", + "27729 34780 19 18473 7 220301 selva 2010.0 0.0 \n", + "\n", + " distrito provincia region unidos \n", + "0 Chachapoyas Chachapoyas Amazonas 1 \n", + "1 Chachapoyas Chachapoyas Amazonas 1 \n", + "2 Chachapoyas Chachapoyas Amazonas 1 \n", + "3 Chachapoyas Chachapoyas Amazonas 1 \n", + "4 Chachapoyas Chachapoyas Amazonas 1 \n", + "... ... ... ... ... \n", + "27725 San Jose de Sisa El Dorado San Martin 0 \n", + "27726 San Jose de Sisa El Dorado San Martin 0 \n", + "27727 San Jose de Sisa El Dorado San Martin 0 \n", + "27728 San Jose de Sisa El Dorado San Martin 0 \n", + "27729 San Jose de Sisa El Dorado San Martin 0 \n", + "\n", + "[27730 rows x 12 columns]" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#Realizaremos una fusión (merge) entre dos DataFrames: reshape_panel y unidos\n", + "juntos = pd.merge(reshape_panel, unidos, on='ubigeo', how='inner')\n", + "juntos" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ee08714c", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + 
"name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.9" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/Labs/tarea6/Grupo_3_Rshape.R b/Labs/tarea6/Grupo_3_Rshape.R new file mode 100644 index 0000000..8c58679 --- /dev/null +++ b/Labs/tarea6/Grupo_3_Rshape.R @@ -0,0 +1,121 @@ +# se instalan las librerías + +install.packages("stringr") +install.packages("rebus") + +library(stringr) +library(rebus) +library(haven) +library(dplyr) +library(tidyverse) +library(tidyr) + +#se lee el archivo data.dta +panel <- read_dta("C:\\Users\\ALICIA\\Documents\\GitHub\\ultima tarea\\data\\data.dta") +panel + +#se visualizan los nombres de las columnas del archivo +columnas <- colnames(panel) + + +#se renombran las columnas específicas en el dataframe 'panel' +panel <- panel %>% + rename(numpanh_15 = numpanh15, # Renombrar columna 'numpanh15' a 'numpanh_15' + numpanh_16 = numpanh16, # Renombrar columna 'numpanh16' a 'numpanh_16' + numpanh_17 = numpanh17, # Renombrar columna 'numpanh17' a 'numpanh_17' + numpanh_18 = numpanh18, # Renombrar columna 'numpanh18' a 'numpanh_18' + numpanh_19 = numpanh19) # Renombrar columna 'numpanh19' a 'numpanh_19' + +#se imprime el dataframe 'panel' actualizado +panel + +#se obtiene la lista de nombres de columnas del dataframe 'panel' +filter_list <- colnames(panel) + +#se crea una nueva lista con valores únicos de la lista de nombres de columnas +new_list <- unique(filter_list) + +#se imprime la nueva lista de nombres de columnas +new_list + + + + +#se obtiene la lista de columnas +columnas <- c('numper', 'numpanh_15', 'numpanh_16', 'numpanh_17', 'numpanh_18', 'numpanh_19', 'mes_15', 'ubigeo_15', 'dominio_15', 'p400a3_15', 'p4022_15', 'mes_16', 'ubigeo_16', 'dominio_16', 'p400a3_16', 'p4022_16', 'mes_17', 'ubigeo_17', 'dominio_17', 'p400a3_17', 'p4022_17', 'mes_18', 'ubigeo_18', 'dominio_18', 'p400a3_18', 'p4022_18', 'mes_19', 'ubigeo_19', 'dominio_19', 'p400a3_19', 'p4022_19') + +#se 
# Derive the variable stems (the text before "_") from the wide column names,
# skipping the first entry, which is the person identifier `numper`.
prefixo <- unique(sub("_.*", "", columnas[-1]))

# Reshape the panel from wide (one column per variable and period) to long
# (one row per person and period).
# The ".value" sentinel keeps every variable in its own column with its own
# type, instead of funnelling all values through a single column as
# gather()/spread() did, which coerced them to one common type.
stem_pattern <- paste0("^(", paste(prefixo, collapse = "|"), ")_")
reshape_panel <- panel %>%
  pivot_longer(
    cols = matches(stem_pattern),
    names_to = c(".value", "period"),
    names_sep = "_"
  ) %>%
  select(numper, period, any_of(prefixo)) %>%
  arrange(numper, period)

# Print the long-format panel.
print(reshape_panel)


# Read the district-level file that is joined to the panel.
# NOTE(review): absolute, machine-specific path; consider a project-relative
# path so the script runs on other machines.
unidos <- haven::read_dta("C:\\Users\\ALICIA\\Documents\\GitHub\\ultima tarea\\data\\unidos.dta")
unidos

# Inner join on the shared key `ubigeo` (all = FALSE keeps matched rows only).
juntos <- merge(reshape_panel, unidos, by = "ubigeo", all = FALSE)
juntos


# Variable labels for 'reshape_panel', keyed by column name.
# The vector must be named: indexing an unnamed vector by name returns NA, so
# every label would otherwise be set to NA.
labels <- c(
  numper = "Número de persona",
  period = "Período",
  numpanh = "Número de panh",
  mes = "Mes",
  ubigeo = "Ubigeo",
  dominio = "Dominio",
  p400a3 = "P400a3",
  p4022 = "P4022"
)
for (var in intersect(names(reshape_panel), names(labels))) {
  attr(reshape_panel[[var]], "label") <- labels[[var]]
}

# Print the labelled panel.
print(reshape_panel)


# Install sjlabelled only when it is missing, then attach it.
if (!requireNamespace("sjlabelled", quietly = TRUE)) {
  install.packages("sjlabelled")
}
library(sjlabelled)

# Attach a variable label to every labelled column of 'juntos'.
juntos_labels <- c(labels, unidos = "Etiqueta de 'unidos'")
for (var in names(juntos_labels)) {
  set_label(juntos[[var]]) <- juntos_labels[[var]]
}

# Print the merged data with its variable labels.
print(juntos)


# Convert a coded vector into a factor with the given value labels.
#   x      : vector of codes (plain or haven-labelled).
#   levels : codes to recognise.
#   labels : value label for each code, in the same order as `levels`.
# Returns a factor. Codes missing from `levels` still become NA, but a warning
# lists them instead of dropping them silently. The variable-level "label"
# attribute is carried over because factor() would otherwise discard it.
label_codes <- function(x, levels, labels) {
  var_label <- attr(x, "label", exact = TRUE)
  codes <- haven::zap_labels(x)
  unmatched <- setdiff(unique(codes[!is.na(codes)]), levels)
  if (length(unmatched) > 0) {
    warning(
      "Codes without a value label become NA: ",
      paste(unmatched, collapse = ", "),
      call. = FALSE
    )
  }
  out <- factor(codes, levels = levels, labels = labels)
  attr(out, "label") <- var_label
  out
}

# Add value labels to 'p4022' and 'unidos'.
# The earlier standalone set_labels() calls discarded their result and changed
# nothing, so the value labels are applied here only.
# NOTE(review): the notebook output earlier in this file shows both columns
# coded 0/1, which the placeholder levels 1-5 do not cover; confirm the real
# codes and label texts against the codebook.
juntos$p4022 <- label_codes(
  juntos$p4022,
  levels = c(1, 2, 3, 4, 5),
  labels = c("Valor1", "Valor2", "Valor3", "Valor4", "Valor5")
)
juntos$unidos <- label_codes(
  juntos$unidos,
  levels = c(1, 2, 3, 4, 5),
  labels = c("Etiqueta1", "Etiqueta2", "Etiqueta3", "Etiqueta4", "Etiqueta5")
)

# Print the final result.
print(juntos)
diff --git a/Labs/tarea6/jec_data_Grupo3.dta b/Labs/tarea6/jec_data_Grupo3.dta new file mode 100644 index 0000000..1eaecf0 Binary files /dev/null and b/Labs/tarea6/jec_data_Grupo3.dta differ