From fa6aa493e4fc4d026d18e540d5ee4bf258ab4221 Mon Sep 17 00:00:00 2001 From: Ryo-N7 Date: Thu, 20 Sep 2018 12:49:10 +0900 Subject: [PATCH 1/2] - add in english names to jpnprefs dataset --- data-raw/jpnprefs.R | 54 +++++++++++++++++++++++++++++++++++--- data/jpnprefs.rda | Bin 1995 -> 2550 bytes inst/extdata/jpnprefs.rds | Bin 4774 -> 7590 bytes 3 files changed, 51 insertions(+), 3 deletions(-) diff --git a/data-raw/jpnprefs.R b/data-raw/jpnprefs.R index 2a9a403..c1dcea1 100644 --- a/data-raw/jpnprefs.R +++ b/data-raw/jpnprefs.R @@ -14,6 +14,10 @@ library(tidyverse) # dplyr # 0.7.6 # tidyr # 0.8.1 # purrr # 0.2.5 +# (stringr) # 1.3.1 + +library(polite) # 0.0.0.9004 + # Japanese ---------------------------------------------------------------- @@ -22,7 +26,7 @@ x <- df <- x %>% - html_nodes(css = "#mw-content-text > div > table.wikitable.sortable") %>% + html_nodes(css = "table.wikitable:nth-child(104)") %>% # css to correct table as wiki page was edited html_table(fill = TRUE) %>% purrr::flatten_df() %>% select(2, 4, 6, 11) %>% @@ -92,10 +96,54 @@ jpnprefs %<>% select(jis_code, prefecture, capital, region, major_island, capital_latitude = latitude, capital_longitude = longitude) %>% as_tibble() +# ---- English region and island names +url <- "https://en.wikipedia.org/wiki/Prefectures_of_Japan" + +session <- bow(url) + +jpn_pref_raw <- scrape(session) %>% + html_nodes("table.wikitable:nth-child(49)") %>% + #.[[1]] %>% + html_table() %>% + purrr::flatten_df() + +jpn_pref_df <- jpn_pref_raw %>% + janitor::clean_names() %>% + select(kanji, region_en = region, major_island_en = major_island) %>% + mutate(region_en = region_en %>% iconv(from = "UTF-8", to = "ASCII//TRANSLIT")) + +# ---- English prefecture and capital names +url2 <- "https://en.wikipedia.org/wiki/List_of_Japanese_prefectures_by_population" + +session2 <- bow(url2) + +jpn_pref2_raw <- scrape(session2) %>% + html_nodes("table.wikitable:nth-child(7)") %>% + #.[[1]] %>% + html_table() %>% + purrr::flatten_df() + +jpn_pref2_df <- jpn_pref2_raw %>% + janitor::clean_names() %>% + select(kanji = japanese, prefecture_en = prefectures, capital_en = capital) %>% + mutate(prefecture_en = prefecture_en %>% iconv(from = "UTF-8", to = "ASCII//TRANSLIT"), + capital_en = capital_en %>% iconv(from = "UTF-8", to = "ASCII//TRANSLIT")) + +# ---- Join with jpnprefs +jpnprefs <- jpnprefs %>% + left_join(jpn_pref_df, by = c("prefecture" = "kanji")) %>% + left_join(jpn_pref2_df, by = c("prefecture" = "kanji")) %>% + select(jis_code, prefecture, capital, region, major_island, + prefecture_en, capital_en, region_en, major_island_en, + capital_latitude, capital_longitude) %>% + as_tibble() + expect_named(jpnprefs, - c("jis_code", "prefecture", "capital", "region", "major_island", "capital_latitude", "capital_longitude")) + c("jis_code", "prefecture", "capital", "region", "major_island", + "prefecture_en", "capital_en", "region_en", "major_island_en", + "capital_latitude", "capital_longitude")) expect_equal(dim(jpnprefs), - c(47, 7)) + c(47, 11)) expect_s3_class(jpnprefs, c("data.frame", "tbl_df")) diff --git a/data/jpnprefs.rda b/data/jpnprefs.rda index c6764bc8fb20b7cc476fdfc9d7eb01d48e669380..9679bedce89f4e0d680145f752bb0b24f0bd1708 100644 GIT binary patch literal 2550 zcmZWkc|6qn8lEv@Oc~4Ajis^HP}X0T(luipV;Ni7;xNRFgA;{RG8m#L%h=~hLzZNz?AhW@_ul{R^T+djp7(v;&-;E2oXNWCI44Crr!eOXClH)?%*OX0 z(+9ua*KD5tCBFZn#pBmU-w*B_&%pJi?~pGB+)$UMg1ajda(g}%B@-?kOdgxx62gu(-Xf++cni5%;3#RM#X zxBn@$E$vGj)05u@{N>)Ew2MXo<~9hy51_cpU{nN^2|%zYFayK{vB3lIfY=5FD**bJ zkp+nW#3Ygd0+&RO0w4ec!7)J6jD7~6*%CX>1lS`m>aQ^n4{@|u=1Vp^1EZ4JWoZT( zgox(nVYgt}z<(laBBR(6`aefz@B9TaIKL44U#-EmY{erEa2Um5q7<3g8Ww`&x0dz9 zF|KT(P5TWfW%)MM-?ThQx^Psl&=+8ip!l~a^lcDxTU!1$`&4jgnRj+xA<@o~&t?H6 zonL53w8L|Y2;dvTh1F%a@1!ecQunKC`tsrCnpfeb_T7J0QKM@s940<(-?=w@!~I2c z-lfo^)jE>Ft9zWtw>1U=2e~uhDqrBAPTcR@ElZa&D9N|U%CBAX)X_x?+Y;iLZ;RBOV&@%?aSn6kW?(qa%{j^vLPPS5q zB+{l`U=kBPm@Z<&DHim4-eiL9k7$_X5$2G2iB*Z0RLqvd?Yl zo%!>BmJ=Ml!VS*(lS_%dBMp}&w7aj1K;Vm}T^jabhP*}4`nqMbn24p44^=iR14Z?NW^bfUeQwA2{S-%5KlnK7?xCdYqTKk9G*q2|Ti@x! z^{OQ9jk~FC&6{(p&&)=sU_7r?_p&^TgBdFsP^tr;S0sO(pDOD3agI}LwEZKKXQ=@j zx1AT()xz%%)l7;@{H7Gfk!5Y>Ys4WXWSf;u+Cvy?382(Vdo&ZG8B$(iJ+ViSkR6wR zsEIGU{l-uzq{bWEF%Vk1+?jK0RXpnmi1rTbZRhNI3v&*M|Dw^J0TsU^!`;!>v!IMj{st?2d^TM!j!8bc*GrK+{b3+nGZtwaCm+}p zj1MT~M7o?w3+@0`8kr9W3+*NwgUkO}%f#*Ii!nYWqW@ zU`a;hP_=phb#k_ua}uLNogj8%< zKhh$dE)g(Ee!*IODL@tDEh!U9IvJ^I-Y~+F?@lh#w7y6JR*wvl%B8u1P?iyoLbSo*M5 zIw2(Y+dY{l2k%$@F}^P_Jm8QYUg*mEAi3ZJHkZJnReo@TBW_6V-r5ZbXv}1U^jq;o z#EK{<`eY6KQA-e|cPYC)q~L4fsxRgCJ`_9eVvcU|Iw6y*uzuaYkpLoFj#?#1P5@dF ziNXj^u~)C4Y4oa(C;db*`f?0Lj{=MQ9NR+!^PCaOgmlOP0_ozA8D=!rEj0zBbAcCK zNlA1K_RFck+NR{CDp#wlY3hy38BYC>)Occ>d0U&;_?s12nkC@UnPxV=3WXMWr2QQq zWQo_q0tb1ZjxH_>qOi-(Kta!KlX$~mHJa_7#!Bl zd$Ee7hMuqTsXw(7yKd>;TmU}06V)qP@i;d4eyrz;VaH#p%@`+rVmx_mF!Dy&pnhoh zj>YUJ9Bb?u-%G-kkkf(j!*M55-skNONv$~@WZ9qY%+~4&5fimWX)22b-M2oq^e5GF zUx#Xp*TXuaCKstOGGh7DfBU&ZnW*`u*^7DFd8!Z5ySw8qqFth%DWJ`QDz`K zb!?F8I{_|pRPO9_jEcqTIUl-z7D!QQI2JbNnxQmkyOI{5U8gXSx~gEWZiTu zV!DY-mUH@?ESu(NwKzu1yJUSQ2Bc@{ z9GVA&zkj_RlJ{GSdECpXePrV$$>gE+Nsg;HrlKfZNADHkxm2GAq-Q6cv-Fyo;boFk-Km7du5bbuRf_r@q3!L z+Fdu-YL~$!J48EqZz<%N(pzv{F7$6K!=Y$DK~Y&Vs2^T|Ho_o#IP}c!p9_stnyZP6 z#j|>fR{kY)3&0B{IXT4*^jL0KkKS*ZZ6Bme~x|NsC0|NsC0|NsBLz5D@j0~QM9-*MpY>DV(Fhgpd ziQ)mGG}8bAGGu9*nrJqr001Yck)e|TFiZ&3BO?$Dj3I#o0WxA336X@rY9mic>QwSi zYE2K+jRBwl0009`G&INn4FCWD(dv4D8VrB{8UO=800Te(000003ZJT)ki==G36lVt zU<5K`z!M3HfB*t$FpUEMBPLBSnG6InU`zyJU;!|VFaS&@CYTAMMw$eYBs5H>CIvR8 zPfbUWrkNurlS4>)n@BW0LroeOMwp(Oo|=K9L8h8%iH%J%2c#Gw=^A2a8UciDL8Cwh z)G`<+r1Y300PSWDytMkF8vI+O% z+O*BIe%#6fd$XFEMepnhF0jPkxj5cCQZ}w>s8F^U`%a;>;}KDslA(P#EhJh1Oce zaVPw8=|hH#>@mF}@6G7k;~7mIxWUqm*yK@KU-xQre(0C|(=Zmy+U$QMxj6lt!BtfwTEHn66AZ9xe>f`2BMf$VRb{UTWNpXLj`)Rzq%@X`w=o7~kY_ z9c8g8`0bGGTJf#QfA$SZlLAH&dBG}(MC>4&lA zNe_STu$vx*@RfNY8xnoj7NcHD_ObqMPC1v}95d{@hZEK8{LEe0n>Z<*)8vhn>G6=> zg^KnFlrg(Q&NasB~8iB&3oW ztup!hMN@hxwD+(l_>MxD;2Q8hrn+_9g&`CIL9u^gO@s&vu`QB7yj^}Mm?eR|YC5b- zTWn<%GHdUtakQ~tAK*!}(@_d@UA>w^^B1?N(uWpe3gv0s*G3laHOrS!88mLR=qERr zTyEp*iNm3us5iHXm5k=K+?_1CKw+99y2i{Mf+8{t+`y~@YB;+aYY6xbj+w)%Ym%pt z8G&seO9~0(10>BF{IL`S2qTpi?+8zO8NzsDx~nRr9ZkH1foU=M)22*7B#arzpJ6FE zll~&WB|K*^a1a**fy+w!DO0f|&{rX0`5Ed+#3Z6BhHvkHq{$Q@LPoTTAu{6dj>fXN znJ8I=rdeNo&ql(rf%SV*Jmf&EojvS2*_drrAHGSnS~!*2^$&mSII^xPOs($7sFrLd z`F8j6$2%smwN-1La-s3pF=ri7H;myEXJ`po80VvrO=>?13>{f=*``yuarc*|B0EFA{#sg{CQ2D$~|AO z0t1{McuAfn%dtHw*nSFtbrdev)joS#qF&7#j!pJ8=z>B`SxHGp4LOgGQBglJe_o0@ zz*U6t5D`ouSTRzS7YCsfm}i7%O+G21;8;Qd!3Dqw6AYmSa3W|*wGvv2P4U)Y)Qlnu zOfwv`LLww3EK`q1I>?(XCvuyMQj3T(&G1kquR`uVYE{m{+7gd)C?2%}arlD2>I|Zq dqBa%WICLDB$sEp`Nh%BeF64@Ep&?QMT1Zh@g^&OM diff --git a/inst/extdata/jpnprefs.rds b/inst/extdata/jpnprefs.rds index 749b63d3b38b4436ad6dd86ad0a3ef89c1cddd8a..82c45e0dab436a5dd67aa76f7b771409dd7ef2df 100644 GIT binary patch literal 7590 zcmeHLdr(x@8Q-8RyX@j)Qf+l&>$GYg4w_|AtF`4^h!2YB7^q^Lmc78TTo?8_%dQk_ z6A=Ls5L8|cAc{PEAmO3%Dkw8e(#F<~O-*8JMmuotvdPpTc2Z4Ewdd~Ld(L;)KboYH z>2&hP!ufuD-}n2z@0{Pcdr1n_Xf#2ZDZ$TZf~Mf*lxOh|1OCjwpQW0q8a@6U^t_3z zBFM^2R*_`2jI5TE)e5ruK3T0)R;K64ig1_+hly~Q2#1Mqm55RM4KVI~}A!eJ&HX2M}69A?5{CLCtMVI~}A!VyV0A_+$% z;fN$0k%S|Xa6}T0$Q5LFHG8Gq)eEIH;>BKX-cgMo@HQL~Pjz_eOBF!qJ|h;FK%k>T zXgmY}Pg{wnvJ(J8SC=r<1OQKKxzKY70OO^n)j7ly`C|QP;Ncl=_O`YIKxq7x=TI>O z%JRn#LqKfk8XIW$<`4NxH+ou*3oQqP-UbjQR(Fa+jQ|i{EyeW$z|+{H4#Cxr4O9US z>@POKV!c(p-Xb^>p}WFcTMt6K4abD;lK>F+*LfOS05H~9>@6w-fH+t(-m@0~a&y8! zJ=vP(HgD}&2ozit2F`&H-2Qm+s}wK_M@q%>0S$Y5T7>dy>ZHeeC7$SqK=(=YjD&%F z^@d7YUVaLO4ELx#ysgL7W5p8?&enkt@3~s_NTg$<8rs-Ug?c1-@WKT+cu(O$v0im- zn%jf}g@8rC#}|eez5SPkf(mu9Vt2KtZ@(|fe;fMcA=aN4`b&kreaarldI5l^t_~Ls zfU&`2LU|_uWW4}D8s1P70A#%YKE zXkO!J-Y|vrf=@U;z<0%KE4Gwpl!!`4;_0*E#d_c`O9lY4WB}kTIxKbvG=L=oJUo@% z_?!YjmJ9%7$pAo>3>q>Y*2!+th$xGGIosWGH~|2-RfuvSesd zDjACDWyt_QY#0{18UY}k0lcl0NR|u)e3F5HPci_&g1~tJAX%!UvSfgVEExdEk^z7$ z834$V0RWyNmJ9@ZlA!>V48`=`eW%7-i+q4uGJ%g6ExxBQ=09#v$?{e=;j~K5$*{)y z&U^DuD-2(>HGcW9r|Po8J{cD7NT^dDo!ZZ93HbJ-Sw6ir*54rX9#{IC3`@S!>GvP` z>Z`UNj5*pbeG|f;snx$IkN$G~zc+_NnPrESefc!&l2N*kPitM%QBr_wfbT}S^}OBA za%sFqepd=*_$=PZh1;zTKcbJ#VO>@LgvN8ZtWD*@*f|#qT#Vb;EY`-Vu&}jmyE{9B z%Yu%E*c8^u+F>$7BA=SUsq?RMJJewbEGASz`j`wZ1*X;|^7dRl+@*A8jAI?RrX1*F z*vN4KWrim5x$4ZuSX{MT9jKR@fmMe|9XnX;3C0k*5feIifaun7>26>f6Ee8nZr%=y z(lZ<$0B{P4=ivPHIK~D8baA=7iw{qC`*%#YDVxO^iLGCs%iB~{Y{Wwc1jen{KcH*8 z4KA1ZBp5I;ZW7jF*z`nT*m};%<2k5y$2T6ETisU}-k~Q>L}*;#Ch6DUWRO%JC!G*b zzd_m!^$>LlI1O+yN?U=)0>g}P?kqM-8XZ0Dx=jHmSvu^y)p23C#3wq$li(c6AyI40 zkihQT$vgRQJe2^MQN$3Bhb{F$q0j`YBaP(%qL+n6U8v9~282dIT|9^Tr1rF*xU$_2 zJ_~mTP-8r6P4OLK>WQnDwMHSo)+ivI495<<3E_6-5E{4fc0NN=5^z)GMj=IR6!GbF zw=_wL`Q=6dT^x^fL~TWB6p@uiZDWF>G>S=4SnVKBNMbfjtJ{R91pcg-l}0n@WvNjB z7nWiL5jvmL=-;W-C`P14q0o31Gs8qgY19cQ4S>F@*QYc9Ql&uW9JP%^a1;ZAqadDahP-uZwFN%Ww26EMzGwWxdpcGy zewL62%TqRDGTbRjwZUJIp~d}Vv&!W(8CD4H#74K_zcLf>;~1p+ET%Rt7e8^7@lK}o z9}eYu-T~_P(o*F4e=v!V&vGCGoG0@{%3tuOy_oB^=ep;|W@oaFv~2u3Fb#i}qBTYL zk}fYSMK5qG;$Cl?KruJ}yz+NPN>R)=H|GR>v=PPJn*Zy%mQfTlzq4xXYco*vU(>D~ zzu`pDOWrz>wJHQfuQM!i{i7X4o8GP(T(=4_cVpMwGTcDS?}z3@uXu=#BA+3Soy~o#LU|}Z~npUh`I8Oztkj5N6eu2M@PL;h)KBpgS40CA;z>Z zEMm9|F|*$+Wd7BU7~P{=1GT#lv-r+}jP?%^`oP{472S@|8-){B?pGmnRcebv=;YwK zl}9!sWYN!EygDACbU7X!KmBe~a1e?b|Jj+5)+7{FdwF5Wy=`dKa?8X9x1G$NJ-6j= z&xx^!t&!_tn{oQeTEx~}ul&|4XAqme##Vgp8e$K=$8EphLTvfM*Ueed?`OBx-C2Ox zJy9R>18*TVMSCncsQ|IA)+1kCb{EIlW_`E05V5=RcDHGdVcc$e6n-DE?DyWdx-1y6 zi*ASy^m7qAD`+pC7-B>G@h>i&+c=zq*tIKW793rL*suBP+wenSk;aYKd74$sS1uxU zM)_acjs)74iRjx7H9 zSxz20Z|yyWEJKc(vQ4*<^uEU-%j(%9nX^ZbWsBUuCHv)qq>pXLl9j!n>8=)8Hs5f% zZ+D{P4|dI&c(@Y($0P{v!888*NAKiwmN?ift0ast{0Z8Ne>{Ht_e~K`gBAL9}hx)TnL@y_zz;u|4!ZVb} WOxDG@+@#W(1m_(#`41Y+qkjXS>JV)J delta 812 zcmc)IyGjE=6vpw{OLjNWxSE?u+-TIqOVs!PR$5zFDcS{zRg7Rn>{hT6EEWc=#KI!g zBFYwmjgR0XXtPP_8>s)a@C`J@4`w)Xm~V#DUF%l8tYSXq-W69%3L#dZK!pY!F&Hpm zA&vxwFpQ{g1U5!7h7{7sAd4Kvkw*bVOu#`2lVVqhDHdf+V+OORpo%%nV*!h(VF}Av zK^=tGqu>g$qmV|cdD?7sN&8D4_!8_ynEw^Pq#w+)Q#T1@7%#&@4Ewiy&rD>&F9O_#GTRiF8kg^uwF1URsE8# avmv*fTC&|ZY<8H2XqZj;>g>t0QvDmjC)aWS From 90b6c8b13c536d5bf5346b89aea02654f0e39918 Mon Sep 17 00:00:00 2001 From: Ryo-N7 Date: Thu, 20 Sep 2018 23:25:54 +0900 Subject: [PATCH 2/2] edits to comply with PR review --- data-raw/jpnprefs.R | 23 ++++++----------------- 1 file changed, 6 insertions(+), 17 deletions(-) diff --git a/data-raw/jpnprefs.R b/data-raw/jpnprefs.R index c1dcea1..2e49bd1 100644 --- a/data-raw/jpnprefs.R +++ b/data-raw/jpnprefs.R @@ -14,11 +14,6 @@ library(tidyverse) # dplyr # 0.7.6 # tidyr # 0.8.1 # purrr # 0.2.5 -# (stringr) # 1.3.1 - -library(polite) # 0.0.0.9004 - - # Japanese ---------------------------------------------------------------- x <- @@ -99,33 +94,27 @@ jpnprefs %<>% # ---- English region and island names url <- "https://en.wikipedia.org/wiki/Prefectures_of_Japan" -session <- bow(url) - -jpn_pref_raw <- scrape(session) %>% +jpn_pref_raw <- read_html(url) %>% html_nodes("table.wikitable:nth-child(49)") %>% - #.[[1]] %>% html_table() %>% purrr::flatten_df() jpn_pref_df <- jpn_pref_raw %>% - janitor::clean_names() %>% - select(kanji, region_en = region, major_island_en = major_island) %>% + select(2, 4, 5) %>% + set_colnames(c("kanji", "region_en", "major_island_en")) %>% mutate(region_en = region_en %>% iconv(from = "UTF-8", to = "ASCII//TRANSLIT")) # ---- English prefecture and capital names url2 <- "https://en.wikipedia.org/wiki/List_of_Japanese_prefectures_by_population" -session2 <- bow(url2) - -jpn_pref2_raw <- scrape(session2) %>% +jpn_pref2_raw <- read_html(url2) %>% html_nodes("table.wikitable:nth-child(7)") %>% - #.[[1]] %>% html_table() %>% purrr::flatten_df() jpn_pref2_df <- jpn_pref2_raw %>% - janitor::clean_names() %>% - select(kanji = japanese, prefecture_en = prefectures, capital_en = capital) %>% + select(3, 2, 4) %>% + set_colnames(c("kanji", "prefecture_en", "capital_en")) %>% mutate(prefecture_en = prefecture_en %>% iconv(from = "UTF-8", to = "ASCII//TRANSLIT"), capital_en = capital_en %>% iconv(from = "UTF-8", to = "ASCII//TRANSLIT"))