【競馬】【再実行】rvestでyahoo競馬にある過去のレース結果をスクレイピングしてみた。 - 東京の投資顧問会社で働く社会人が研究に没頭するブログ

今回で3回目になりますが、またまたrvestで過去のレース結果を落としてみたいと思います。過去の記事を見てないという人は先にそちらをご覧になられることをお勧めします。

osashimix.hatenablog.com

今回データを取り直そうと思ったのは、競馬の分析をした際により多くの項目を説明変数に加えて、分析をしたいと思ったからです。なので、今回は前回のRスクリプトに追記を行う形でプログラムを作成しました（前回も前々回のスクリプトに付け足しただけですが）。新たに追加したデータ項目は以下の14個です。

芝かダートか
右回りか左回りか
レースコンディション（良や稍重など）
天候
馬の毛色（栗毛、鹿毛など）
馬主
生産者
産地
生年月日
父馬
母馬
そのレースまでの獲得賞金（2003年から入手可能）
ジョッキーの体重
ジョッキーの体重の増減

実はまだデータ収集は終わっていなくて、Rのプログラムがずっと実行中になっています（3日くらい回しています）。しかし、プログラム自体はきっちり回っているのでスクリプトの紹介をしていこうと思います。もしかしたら追記で結果を書くかもしれません。

スクリプトの中身

まずはパッケージの呼び出しです。

# rvestによる競馬データのwebスクレイピング

#install.packages("rvest")
#if (!require("pacman")) install.packages("pacman")
#install.packages("beepr")
#install.packages("RSQLite")
pacman::p_load(qdapRegex)
library(rvest)
library(stringr)
library(dplyr)
library(beepr)
library(RSQLite)

かなりwarnningが出るのでそれを禁止し、SQLiteに接続しています

# warnning禁止
options(warn=-1)

# SQLiteへの接続
con = dbConnect(SQLite(), "horse_data.db", synchronous="off")

1994年からしかオッズが取れないので、1994年から直近までのデータを取得します。yahoo競馬では月ごとにレースがまとめられているので、それを変数として使用しながらデータをとっていきます。基本的には、該当年、該当月のレース結果一覧へアクセスし、そのページ上の各日の個々の競馬場ごとのタイムテーブルへのリンクを取得します。個々の競馬場でレースはだいたい12ほどあるので、そのリンクを取得し、各レースのレース結果ページにアクセスします。そして、レース結果を取得していきます。まず、各日の個々の競馬場ごとのタイムテーブルへのリンクの取得方法です。

for(year in 1994:2018){
  start.time <- Sys.time() # 計算時間を図る
  # yahoo競馬のレース結果一覧ページの取得
  for (k in 1:12){ # kは月を表す
    
    tryCatch(
      {
        keiba.yahoo <- read_html(str_c("https://keiba.yahoo.co.jp/schedule/list/", year,"/?month=",k)) # 該当年、該当月のレース結果一覧にアクセス
        Sys.sleep(2)
        race_lists <- keiba.yahoo %>%
          html_nodes("a") %>% 
          html_attr("href") # 全urlを取得
        
        # 競馬場ごとの各日のレースリストを取得
        race_lists <- race_lists[str_detect(race_lists, pattern="race/list/\\d+/")==1] # 「result」が含まれるurlを抽出
      }
      , error = function(e){signal <- 1}
    )

ここでは、取得したリンクのurlにresultという文字が含まれているものだけを抽出しています。要はそれが各競馬場のレーステーブルへのリンクとなります。ここからは取得した競馬場のレーステーブルのリンクを用いて、そのページにアクセスし、全12レースそれぞれのレース結果が掲載されているページのリンクを取得していきます。

    for (j in 1:length(race_lists)){ # jは当該年月にあったレーステーブルへのリンクを表す
      
      tryCatch(
        {
          race_list <- read_html(str_c("https://keiba.yahoo.co.jp",race_lists[j]))
          race_url <- race_list %>% html_nodes("a") %>% html_attr("href") # 全urlを取得
          
          # レース結果のurlを取得
          race_url <- race_url[str_detect(race_url, pattern="result")==1] # 「result」が含まれるurlを抽出
        }
        , error = function(e){signal <- 1}
      )

各レース結果へのリンクが取得できたので、ここからはいよいよレース結果の取得とその整形パートに入ります。かなり長ったらしく複雑なコードになってしまいました。レース結果は以下のようなテーブル属性に格納されているので、まずそれを単純に引っ張ってきます。 f:id:osashimix:20180814213644p:plain

      for (i in 1:length(race_url)){ # iは当該年月当該競馬場で開催されたレースを表す
        
        print(str_c("現在、", year, "年", k, "月",j, "グループ、", i,"番目のレースの保存中です"))
        
        tryCatch(
          {
            race1 <-  read_html(str_c("https://keiba.yahoo.co.jp",race_url[i])) # レース結果のurlを取得
            signal <- 0
            Sys.sleep(2)
          }
          , error = function(e){signal <- 1}
        )
        
        # レースが中止orこれまでの過程でエラーでなければ処理を実行
        if (identical(race1 %>%
                      html_nodes(xpath = "//div[@class = 'resultAtt mgnBL fntSS']") %>%
                      html_text(),character(0)) == TRUE && signal == 0){
          
          # レース結果をスクレイピング
          race_result <- race1 %>% html_nodes(xpath = "//table[@id = 'raceScore']") %>%
            html_table()
          race_result <- do.call("data.frame",race_result) # リストをデータフレームに変更
          
          colnames(race_result) <- c("order","frame_number","horse_number","horse_name/age","time/margin","passing_rank/last_3F","jockey/weight","popularity/odds","trainer") #　列名変更

tableをただ取得しただけでは以下のように\nが入っていたり、一つのセルに複数の情報が入っていたりと分析には使えないデータとなっています。

> head(race_result)
      order frame_number horse_number                         horse_name/age     time/margin
    1     1            4            5     ヤマカツエース\n    \n牡4/492(+6)/          2.01.2
    2     2            5            7   マイネルフロスト\n    \n牡5/488(+4)/   2.01.33/4馬身
    3     3            6           10         フルーキー\n    \n牡6/490(+6)/   2.01.43/4馬身
    4     4            8           14 ライズトゥフェイム\n    \n牡6/488(+4)/ 2.01.61 1/4馬身
    5     5            3            3     ステラウインド\n    \n牡7/490(+4)/    2.01.6アタマ
    6     6            4            6 ブライトエンブレム\n    \n牡4/492(+4)/    2.01.6アタマ
      passing_rank/last_3F         jockey/weight popularity/odds   trainer
    1      05-05-04-0233.0   池添 謙一\n    56.0      3    (4.9) 池添 兼雄
    2      01-01-01-0134.4   松岡 正海\n    57.0      5    (6.9)   高木 登
    3      08-09-08-0832.7  M.デムーロ\n    57.5      1    (3.4) 角居 勝彦
    4      10-11-10-1032.6 石川 裕紀人\n    56.0     6    (12.4) 加藤 征弘
    5      03-03-03-0233.6   蛯名 正義\n    56.0     7    (17.2) 尾関 知人
    6      06-06-06-0633.2  C.ルメール\n    56.0      2    (4.8) 小島 茂之

なので、これを成型する必要が出てきます。

          # 通過順位と上り3Fのタイム
          race_result <- dplyr::mutate(race_result,passing_rank=as.character(str_extract_all(race_result$`passing_rank/last_3F`,"(\\d{2}-\\d{2}-\\d{2}-\\d{2})|(\\d{2}-\\d{2}-\\d{2})|(\\d{2}-\\d{2})")))
          race_result <- dplyr::mutate(race_result,last_3F=as.character(str_extract_all(race_result$`passing_rank/last_3F`,"\\d{2}\\.\\d")))
          race_result <- race_result[-6]
          
          # タイムと着差
          race_result <- dplyr::mutate(race_result,time=as.character(str_extract_all(race_result$`time/margin`,"\\d\\.\\d{2}\\.\\d|\\d{2}\\.\\d")))
          race_result <- dplyr::mutate(race_result,margin=as.character(str_extract_all(race_result$`time/margin`,"./.馬身|.馬身|.[:space:]./.馬身|[ア-ン-]+")))
          race_result$margin[race_result$order==1] <- "トップ"
          race_result$margin[race_result$margin=="character(0)"] <- "大差"
          race_result$margin[race_result$order==0] <- NA
          race_result <- race_result[-5]
          
          # 馬名、馬齢、馬体重
          race_result <- dplyr::mutate(race_result,horse_name=as.character(str_extract_all(race_result$`horse_name/age`,"[ァ-ヴー・]+")))
          race_result <- dplyr::mutate(race_result,horse_age=as.character(str_extract_all(race_result$`horse_name/age`,"牡\\d+|牝\\d+|せん\\d+")))
          race_result$horse_sex <- str_extract(race_result$horse_age, pattern = "牡|牝|せん")
          race_result$horse_age <- str_extract(race_result$horse_age, pattern = "\\d")
          race_result <- dplyr::mutate(race_result,horse_weight=as.character(str_extract_all(race_result$`horse_name/age`,"\\d{3}")))
          race_result <- dplyr::mutate(race_result,horse_weight_change=as.character(str_extract_all(race_result$`horse_name/age`,"\\([\\+|\\-]\\d+\\)|\\([\\d+]\\)")))
          race_result$horse_weight_change <- sapply(rm_round(race_result$horse_weight_change, extract=TRUE), paste, collapse="")
          race_result <- dplyr::mutate(race_result,brinker=as.character(str_extract_all(race_result$`horse_name/age`,"B")))
          race_result$brinker[race_result$brinker!="B"] <- "N"
          race_result <- race_result[-4]
          
          # ジョッキー
          race_result <- dplyr::mutate(race_result,jockey=as.character(str_extract_all(race_result$`jockey/weight`,"[ぁ-ん一-龠]+\\s[ぁ-ん一-龠]+|[:upper:].[ァ-ヶー]+")))
          race_result <- dplyr::mutate(race_result,jockey_weight=as.character(str_extract_all(race_result$`jockey/weight`,"\\d{2}")))
          race_result$jockey_weight_change <- 0
          race_result$jockey_weight_change[str_detect(race_result$`jockey/weight`,"☆")==1] <- 1
          race_result$jockey_weight_change[str_detect(race_result$`jockey/weight`,"△")==1] <- 2
          race_result$jockey_weight_change[str_detect(race_result$`jockey/weight`,"△")==1] <- 3
          race_result <- race_result[-4]
          
          # オッズと人気
          race_result <- dplyr::mutate(race_result,odds=as.character(str_extract_all(race_result$`popularity/odds`,"\\(.+\\)")))
          race_result <- dplyr::mutate(race_result,popularity=as.character(str_extract_all(race_result$`popularity/odds`,"\\d+[^(\\d+.\\d)]")))
          race_result$odds <- sapply(rm_round(race_result$odds, extract=TRUE), paste, collapse="")
          race_result <- race_result[-4]

次に、今取得したtable以外の情報も取り込むことにします。具体的には、レース名や天候、馬場状態、日付、競馬場などです。これらの情報はレース結果ページの上部に掲載されています。 f:id:osashimix:20180814215142p:plain

          # レース情報
          race_date <- race1 %>% html_nodes(xpath = "//div[@id = 'raceTitName']/p[@id = 'raceTitDay']") %>%
            html_text()
          race_name <- race1 %>% html_nodes(xpath = "//div[@id = 'raceTitName']/h1[@class = 'fntB']") %>%
            html_text()
          race_distance <- race1 %>% html_nodes(xpath = "//p[@id = 'raceTitMeta']") %>%
            html_text()
        
          race_result <- dplyr::mutate(race_result,race_date=as.character(str_extract_all(race_date,"\\d+年\\d+月\\d+日")))
          race_result$race_date <- str_replace_all(race_result$race_date,"年","/")
          race_result$race_date <- str_replace_all(race_result$race_date,"月","/")
          race_result$race_date <- as.Date(race_result$race_date)
          race_course <- as.character(str_extract_all(race_date,pattern = "札幌|函館|福島|新潟|東京|中山|中京|京都|阪神|小倉"))
          race_result$race_course <- race_course
          race_result <- dplyr::mutate(race_result,race_name=as.character(str_replace_all(race_name,"\\s","")))
          race_result <- dplyr::mutate(race_result,race_distance=as.character(str_extract_all(race_distance,"\\d+m")))
          race_type=as.character(str_extract_all(race_distance,pattern = "芝|ダート"))
          race_result$type <- race_type
          race_turn <- as.character(str_extract_all(race_distance,pattern = "右|左"))
          race_result$race_turn <- race_turn
          
          if(length(race1 %>% html_nodes(xpath = "//img[@class = 'spBg ryou']")) == 1){
            race_result$race_condition <- "良"
          } else if (length(race1 %>% 
                            html_nodes(xpath = "//img[@class = 'spBg yayaomo']")) == 1){
            race_result$race_condition <- "稍重"
          } else if (length(race1 %>%
                            html_nodes(xpath = "//img[@class = 'spBg omo']")) == 1){
            race_result$race_condition <- "重"
          } else if (length(race1 %>% 
                            html_nodes(xpath = "//img[@class = 'spBg furyou']")) == 1){
            race_result$race_condition <- "不良"
          } else race_result$race_condition <- "NA"
          
          if (length(race1 %>% html_nodes(xpath = "//img[@class = 'spBg hare']")) == 1){
            race_result$race_weather <- "晴れ"
          } else if (length(race1 %>% html_nodes(xpath = "//img[@class = 'spBg ame']")) == 1){
            race_result$race_weather <- "曇り"
          } else if (length(race1 %>% html_nodes(xpath = "//img[@class = 'spBg kumori']")) == 1){
            race_result$race_weather <- "雨"
          } else race_result$race_weather <- "その他"

次は各馬の情報です。実はさきほど取得したtableの馬名はリンクになっており、そのリンクをたどると各馬の情報が取得できます（毛色や生年月日など）。

          horse_url <- race1 %>% html_nodes("a") %>% html_attr("href") 
          horse_url <- horse_url[str_detect(horse_url, pattern="directory/horse")==1] # 馬情報のリンクだけ抽出する
          
          for (l in 1:length(horse_url)){
            tryCatch(
              {
                horse1 <-  read_html(str_c("https://keiba.yahoo.co.jp",horse_url[l]))
                Sys.sleep(0.5)
                horse_name <- horse1 %>% html_nodes(xpath = "//div[@id = 'dirTitName']/h1[@class = 'fntB']") %>% 
                  html_text()
                horse <- horse1 %>% html_nodes(xpath = "//div[@id = 'dirTitName']/ul") %>% 
                  html_text()
                race_result$colour[race_result$horse_name==horse_name] <- as.character(str_extract_all(horse,"毛色：.+")) 
                race_result$owner[race_result$horse_name==horse_name] <- as.character(str_extract_all(horse,"馬主：.+"))
                race_result$farm[race_result$horse_name==horse_name] <- as.character(str_extract_all(horse,"生産者：.+"))
                race_result$locality[race_result$horse_name==horse_name] <- as.character(str_extract_all(horse,"産地：.+"))
                race_result$horse_birthday[race_result$horse_name==horse_name] <- as.character(str_extract_all(horse,"\\d+年\\d+月\\d+日"))
                race_result$father[race_result$horse_name==horse_name] <- horse1 %>% html_nodes(xpath = "//td[@class = 'bloodM'][@rowspan = '4']") %>% html_text()
                race_result$mother[race_result$horse_name==horse_name] <- horse1 %>% html_nodes(xpath = "//td[@class = 'bloodF'][@rowspan = '4']") %>% html_text()
              }
              , error = function(e){
                race_result$colour[race_result$horse_name==horse_name] <- NA 
                race_result$owner[race_result$horse_name==horse_name] <- NA
                race_result$farm[race_result$horse_name==horse_name] <- NA
                race_result$locality[race_result$horse_name==horse_name] <- NA
                race_result$horse_birthday[race_result$horse_name==horse_name] <- NA
                race_result$father[race_result$horse_name==horse_name] <- NA
                race_result$mother[race_result$horse_name==horse_name] <- NA
                }
            )
          }
          
          race_result$colour <- str_replace_all(race_result$colour,"毛色：","")
          race_result$owner <- str_replace_all(race_result$owner,"馬主：","")
          race_result$farm <- str_replace_all(race_result$farm,"生産者：","")
          race_result$locality <- str_replace_all(race_result$locality,"産地：","")
          #race_result$horse_birthday <- str_replace_all(race_result$horse_birthday,"年","/")
          #race_result$horse_birthday <- str_replace_all(race_result$horse_birthday,"月","/")
          #race_result$horse_birthday <- as.Date(race_result$horse_birthday)
          
          race_result <- dplyr::arrange(race_result,horse_number) # 馬番順に並べる

次にそのレースまでに獲得した賞金額を落としに行きます。これはレース結果のページの出馬表と書かれたリンクをたどるとアクセスできます。ここに賞金があるのでそれを取得します。

          yosou_url <- race1 %>% html_nodes("a") %>% html_attr("href") 
          yosou_url <- yosou_url[str_detect(yosou_url, pattern="denma")==1]
          
          if (length(yosou_url)==1){
          yosou1 <-  read_html(str_c("https://keiba.yahoo.co.jp",yosou_url)) 
          Sys.sleep(2)
          yosou <- yosou1 %>% html_nodes(xpath = "//td[@class = 'txC']") %>% as.character()
          prize <- yosou[grepl("万",yosou)==TRUE] %>% str_extract_all("\\d+万")
          prize <- t(do.call("data.frame",prize)) %>% as.character()
          race_result$prize <- prize
          race_result$prize <- str_replace_all(race_result$prize,"万","") %>% as.numeric()
          } else race_result$prize <- NA

取得した各レース結果を格納するdatasetというデータフレームを作成し、データを格納していきます。1年ごとにそれをSQLiteへ保存していきます。

          ## ファイル貯めるのかく
          if (k == 1 && i == 1 && j == 1){
            dataset <- race_result
          } else {
            dataset <- rbind(dataset,race_result)
          } # if文2の終わり
        }else
        {
          print("保存できませんでした") 
        }# if文1の終わり
      } # iループの終わり
    } # jループ終わり
  } # kループの終わり
  beep(3)
  write.csv(dataset,"race_result2.csv", row.names = FALSE)
  
  if (year == 1994){
    dbWriteTable(con, "race_result", dataset)
  } else {
    dbWriteTable(con, "temp", dataset)
    dbSendQuery(con, "INSERT INTO race_result select * from temp")
    dbSendQuery(con, "DROP TABLE temp")
  } # ifの終わり
} # yearループの終わり
end.time <- Sys.time()
print(str_c("処理時間は",end.time-start.time,"です。"))
beep(5)

options(warn = 1)

dbDisconnect(con)

以上です。結果はまた後程。