www.kaggle.com

Kaggleで公開されていたStack Overflowのサーベイデータより、Rユーザーの属性などをみてみたいと思います。

「世界の」とタイトルについていますが、Stack Overflowが集計したので、比較的「英語を話せる人が多い国のRユーザー」のデータになっていると感じました。

# Load the Stack Overflow survey responses and list the available columns.
stack <- read.csv(file = "data/survey_results_public.csv")
colnames(stack)

  [1] "Respondent"                  "Hobby"                       "OpenSource"                  "Country"                     "Student"                    
  [6] "Employment"                  "FormalEducation"             "UndergradMajor"              "CompanySize"                 "DevType"                    
 [11] "YearsCoding"                 "YearsCodingProf"             "JobSatisfaction"             "CareerSatisfaction"          "HopeFiveYears"              
 [16] "JobSearchStatus"             "LastNewJob"                  "AssessJob1"                  "AssessJob2"                  "AssessJob3"                 
 [21] "AssessJob4"                  "AssessJob5"                  "AssessJob6"                  "AssessJob7"                  "AssessJob8"                 
 [26] "AssessJob9"                  "AssessJob10"                 "AssessBenefits1"             "AssessBenefits2"             "AssessBenefits3"            
 [31] "AssessBenefits4"             "AssessBenefits5"             "AssessBenefits6"             "AssessBenefits7"             "AssessBenefits8"            
 [36] "AssessBenefits9"             "AssessBenefits10"            "AssessBenefits11"            "JobContactPriorities1"       "JobContactPriorities2"      
 [41] "JobContactPriorities3"       "JobContactPriorities4"       "JobContactPriorities5"       "JobEmailPriorities1"         "JobEmailPriorities2"        
 [46] "JobEmailPriorities3"         "JobEmailPriorities4"         "JobEmailPriorities5"         "JobEmailPriorities6"         "JobEmailPriorities7"        
 [51] "UpdateCV"                    "Currency"                    "Salary"                      "SalaryType"                  "ConvertedSalary"            
 [56] "CurrencySymbol"              "CommunicationTools"          "TimeFullyProductive"         "EducationTypes"              "SelfTaughtTypes"            
 [61] "TimeAfterBootcamp"           "HackathonReasons"            "AgreeDisagree1"              "AgreeDisagree2"              "AgreeDisagree3"             
 [66] "LanguageWorkedWith"          "LanguageDesireNextYear"      "DatabaseWorkedWith"          "DatabaseDesireNextYear"      "PlatformWorkedWith"         
 [71] "PlatformDesireNextYear"      "FrameworkWorkedWith"         "FrameworkDesireNextYear"     "IDE"                         "OperatingSystem"            
 [76] "NumberMonitors"              "Methodology"                 "VersionControl"              "CheckInCode"                 "AdBlocker"                  
 [81] "AdBlockerDisable"            "AdBlockerReasons"            "AdsAgreeDisagree1"           "AdsAgreeDisagree2"           "AdsAgreeDisagree3"          
 [86] "AdsActions"                  "AdsPriorities1"              "AdsPriorities2"              "AdsPriorities3"              "AdsPriorities4"             
 [91] "AdsPriorities5"              "AdsPriorities6"              "AdsPriorities7"              "AIDangerous"                 "AIInteresting"              
 [96] "AIResponsible"               "AIFuture"                    "EthicsChoice"                "EthicsReport"                "EthicsResponsible"          
[101] "EthicalImplications"         "StackOverflowRecommend"      "StackOverflowVisit"          "StackOverflowHasAccount"     "StackOverflowParticipate"   
[106] "StackOverflowJobs"           "StackOverflowDevStory"       "StackOverflowJobsRecommend"  "StackOverflowConsiderMember" "HypotheticalTools1"         
[111] "HypotheticalTools2"          "HypotheticalTools3"          "HypotheticalTools4"          "HypotheticalTools5"          "WakeTime"                   
[116] "HoursComputer"               "HoursOutside"                "SkipMeals"                   "ErgonomicDevices"            "Exercise"                   
[121] "Gender"                      "SexualOrientation"           "EducationParents"            "RaceEthnicity"               "Age"                        
[126] "Dependents"                  "MilitaryUS"                  "SurveyTooLong"               "SurveyEasy"

質問項目がとても多いですが、66番目の項目であるLanguageWorkedWithに注目し、Rユーザーに絞ってみました。

# Peek at the first few entries of the language column.
head(stack$LanguageWorkedWith)

[1] JavaScript;Python;HTML;CSS                       JavaScript;Python;Bash/Shell                     <NA>                                            
[4] C#;JavaScript;SQL;TypeScript;HTML;CSS;Bash/Shell C;C++;Java;Matlab;R;SQL;Bash/Shell               Java;JavaScript;Python;TypeScript;HTML;CSS      
26678 Levels: Assembly Assembly;C Assembly;C;Bash/Shell ... Visual Basic 6;HTML

ヘッドを眺めてみると、使ったことがある言語がセミコロンで区切られています。なので… str_detect(LanguageWorkedWith, "R")が使えそう。ですが、他にも大文字Rで始まる言語があるので、そこらへんをうまく編集して切り取ってみました。

# Keep only respondents who list R among the languages they have worked with.
# LanguageWorkedWith is a ";"-separated list, so R has to be matched as a whole
# entry: at the start or right after a ";", and at the end or right before a
# ";". This avoids false hits on Ruby/Rust without rewriting the column, and it
# uses a single pattern so stringr does not recycle several patterns across
# rows. Rows where the column is NA are dropped by filter().
r_users <- stack %>%
  filter(str_detect(as.character(LanguageWorkedWith), "(^|;)R(;|$)"))

これで、R言語を使ったことのあるユーザーのみに絞れたので、それをr_usersというデータにストアしました。しかし、ここにはかなり不安があるので、より良い解決策を知っている人は教えてほしいです。

EDA

国籍

# Horizontal bar chart of the 30 countries with the most R users.
data.frame(table(r_users$Country)) %>%
  arrange(desc(Freq)) %>%
  # Drop countries that have no R users at all.
  filter(Freq != 0) %>%
  # Rank explicitly by the count column instead of relying on top_n()'s
  # implicit choice of the last column; ties at the cut-off are kept.
  top_n(30, Freq) %>%
  # reorder() sorts the bars by count so the largest ends up on top after
  # coord_flip().
  ggplot(aes(reorder(Var1, Freq), Freq, fill = Var1, label = Freq)) +
  geom_bar(stat = "identity", show.legend = FALSE) +
  geom_label(show.legend = FALSE) +
  coord_flip() +
  labs(x = "", y = "count") +
  theme(axis.text.y = element_text(size = 15))

USが抜群に多いです。日本はtop 30にも入っていないですね。次はUSを抜いて地図でプロットしてみました。

# Per-country respondent counts (columns Var1 = country, Freq = count).
country_dat <- data.frame(table(r_users$Country))

# World polygons without Antarctica.
world <- map_data("world") %>%
  filter(region != "Antarctica")

# Layer 1 draws blank country outlines; layer 2 fills countries by count,
# joining on the country name (Var1 against the map's region).
# NOTE(review): no row is filtered out here even though the title says the USA
# is excluded; counts above the 0-400 limit presumably fall outside the fill
# scale, and names that differ from the map's region names would not be
# drawn - confirm.
ggplot() +
  geom_map(
    data = world, map = world,
    aes(x = long, y = lat, group = group, map_id = region),
    fill = "white", colour = "black"
  ) +
  geom_map(
    data = country_dat, map = world,
    aes(fill = Freq, map_id = Var1),
    colour = "black"
  ) +
  scale_fill_continuous(
    low = "red", high = "yellow", guide = "colorbar",
    limits = c(0, 400)
  ) +
  ggtitle("Data on USA intentionally excluded")

やはり英語圏のカナダやオーストラリアは多めで人口の多いブラジルも多めです。

ここまでの地理データをみると、rタグがついてる質問になるべく早く答えたければUSとインドの人がオンラインで活発な時間を避けたほうがよさそうですね。

学生

# Pie chart of the Student answers among R users; the frequency table is built
# once and its count/label columns are passed to pie().
with(data.frame(table(r_users$Student)), pie(Freq, Var1))

個人的に興味があった点ですが、学生は30~40%くらいのようです。

年齢

# Number of R users per age bracket, one coloured bar per bracket.
ggplot(r_users, aes(x = Age, fill = Age)) +
  geom_bar(show.legend = FALSE)

30代前半までの層が回答者の多くを占めています。

最終学歴

# Number of R users per highest level of formal education, drawn as
# horizontal bars with slightly tilted category labels.
ggplot(r_users, aes(x = FormalEducation, fill = FormalEducation)) +
  geom_bar(show.legend = FALSE) +
  coord_flip() +
  xlab("") +
  theme(axis.text.y = element_text(angle = 20))

まとめ

英語を主体としたプラットフォームなので、あまりアジア圏の人間(特に日本人)に示唆のあるレポートは作れなかったかもしれないです。しかし、Stack Overflowで今後ともがつがつやっていきたい人はもっと興味のある質問項目なんかを分析していっても面白いと思います。さらに世界のKagglerさんたちがすでに多くのKernelを公開しているのでそちらをみてみるのも良いかと思います。

ちなみに先日Stack Overflowのデータサイエンティストが自社で使っているデータをPCAなどを使って分析している動画があがっていたので興味のある人はそちらをみても面白いと思います。

youtu.be