ray88’s diary

お仕事で困ったとき用の自分用の覚書

IEを使用せずにHTMLドキュメントを取得する

■下準備:参照設定しておくこと

■サンプルコード Basic認証なしの場合

Sub HTTPリクエスト()

Dim httpReq As XMLHTTP60
Set httpReq = New XMLHTTP60

httpReq.Open "GET", "https://www.au.com/mobile/area/5g/station/"
httpReq.Send

'Do While httpReq.readyState < 4
'    DoEvents
'Loop

Do While (httpReq.readyState < 4) And (httpReq.Status <> 200)
    DoEvents
Loop

Dim htmlDoc As Object
Set htmlDoc = New HTMLDocument

htmlDoc.write httpReq.responseText

MsgBox "スクレイピング開始"

Dim elementLength As Long

elementLength = htmlDoc.all.Length - 1

Dim wb As Workbook
Dim strPath As String

strPath = "C:\Users\cryst\OneDrive\デスクトップ\ExcelVBAプロジェクト\webスクレイピング\test.xlsx"

'Set wb = Workbooks.Add
'
'wb.SaveAs strPath

Dim i As Long
Dim stationName As String

'For i = 1023 To htmlDoc.all.Length - 1
'For i = 472 To 2000

For i = 472 To elementLength

'    Debug.Print htmlDoc.all(i).tagname
'    If i = 1023 Then
'        Stop
'    End If
    If htmlDoc.all(i).tagname = "H3" Then
        Debug.Print i & "番" & htmlDoc.all(i).innerText
    End If
'    If htmlDoc.all(i).tagname = "TR" And htmlDoc.all(i).tagname <> "" Then
'        Debug.Print i & "番" & htmlDoc.all(i).innerText
'        Debug.Print htmlDoc.all(i).className
'    End If

    If htmlDoc.all(i).tagname = "TD" And htmlDoc.all(i).tagname <> "" Then
        stationName = htmlDoc.all(i).innerText
        Debug.Print i & "番" & htmlDoc.all(i).innerText
        'Debug.Print htmlDoc.all(i).className
    End If
    If htmlDoc.all(i).tagname = "P" Then
        Debug.Print i & "番" & htmlDoc.all(i).className
    End If
'    If htmlDoc.all(i).innerText = "改札付" Then
'        Debug.Print i & "番" & htmlDoc.all(i).innerText
'    End If
    If htmlDoc.all(i).tagname = "SPAN" Then
        Debug.Print i & "番" & htmlDoc.all(i).className
    End If
'    If InStr(htmlDoc.all(i).innerText, "route") > 0 Or InStr(htmlDoc.all(i).innerText, "map") > 0 Then
'        Debug.Print htmlDoc.all(i).tagname
'        Debug.Print htmlDoc.all(i).className
'    End If
Next

'Debug.Print htmlDoc.Title
'Debug.Print htmlDoc.all(1023).innerText
'Debug.Print htmlDoc.all(1023).tagname
'Debug.Print htmlDoc.getElementsByTagName("TR")(0).innerText

Set htmlDoc = Nothing
Set httpReq = Nothing

MsgBox "スクレイピング完了"
End Sub

■サンプルコードBasic認証ありの場合

 ※ページ一番下にあるコード「 Function EncodeBase64Str」でログインIDとPWをBase64エンコードする必要がある

Sub Basic認証ありHTTPリクエスト()
'---------------------------------------------------
'機能:Basic認証ありのHPよりWebスクレイピングする
'参照設定:以下を参照設定すること
'     Microsoft XML v6.0
'     Microsoft HTML Object Library
'--------------------------------------------------
Dim httpReq As XMLHTTP60
Set httpReq = New XMLHTTP60
'------------------------------------------------
'★URLとログインIDとログインPWを設定
Dim URL As String
Dim UserName As String
Dim Password As String

URL = "https://www.test.com/"
UserName = "A12345"
Password = "abc123"

'★URLを送信
httpReq.Open "GET", URL, False
'-----------------------------------------------------
'★Authorizationヘッダーでユーザー名とパスワード送信
  httpReq.setRequestHeader "Authorization", "Basic " & EncodeBase64Str(UserName & ":" & Password)
   
  'キャッシュ対策
  httpReq.setRequestHeader "Pragma", "no-cache"
  httpReq.setRequestHeader "Cache-Control", "no-cache"
  httpReq.setRequestHeader "If-Modified-Since", "Thu, 01 Jun 1970 00:00:00 GMT"
   
'★HTTPリクエストをSend
  httpReq.Send
'----------------------------------------------------
'★HTTPリクエストが完了するまでループ

Do While (httpReq.readyState < 4) And (httpReq.Status <> 200)
    DoEvents
Loop
'----------------------------------------------------
'★レスポンスのあったテキストをHTMLドキュメント型に変換
Dim htmlDoc As Object
Set htmlDoc = New HTMLDocument

htmlDoc.write httpReq.responseText

'HTMLドキュメント型に変換した文字列の書込み速度が間に合わないため
'メッセージBoxを挟むことで速度調整
MsgBox "スクレイピング開始"

Dim elementLength As Long

elementLength = htmlDoc.all.Length - 1

Dim wb As Workbook
Dim strPath As String

strPath = "C:\Users\ユーザ名\デスクトップ\test.xlsx"

'Set wb = Workbooks.Add
'
'wb.SaveAs strPath

Dim i As Long
Dim stationName As String

'For i = 1023 To htmlDoc.all.Length - 1
'For i = 472 To 2000

For i = 472 To elementLength

'    Debug.Print htmlDoc.all(i).tagname
'    If i = 1023 Then
'        Stop
'    End If
    If htmlDoc.all(i).tagname = "H3" Then
        Debug.Print i & "番" & htmlDoc.all(i).innerText
    End If
'    If htmlDoc.all(i).tagname = "TR" And htmlDoc.all(i).tagname <> "" Then
'        Debug.Print i & "番" & htmlDoc.all(i).innerText
'        Debug.Print htmlDoc.all(i).className
'    End If

    If htmlDoc.all(i).tagname = "TD" And htmlDoc.all(i).tagname <> "" Then
        stationName = htmlDoc.all(i).innerText
        Debug.Print i & "番" & htmlDoc.all(i).innerText
        'Debug.Print htmlDoc.all(i).className
    End If
    If htmlDoc.all(i).tagname = "P" Then
        Debug.Print i & "番" & htmlDoc.all(i).className
    End If
'    If htmlDoc.all(i).innerText = "改札付" Then
'        Debug.Print i & "番" & htmlDoc.all(i).innerText
'    End If
    If htmlDoc.all(i).tagname = "SPAN" Then
        Debug.Print i & "番" & htmlDoc.all(i).className
    End If
'    If InStr(htmlDoc.all(i).innerText, "route") > 0 Or InStr(htmlDoc.all(i).innerText, "map") > 0 Then
'        Debug.Print htmlDoc.all(i).tagname
'        Debug.Print htmlDoc.all(i).className
'    End If
Next

'Debug.Print htmlDoc.Title
'Debug.Print htmlDoc.all(1023).innerText
'Debug.Print htmlDoc.all(1023).tagname
'Debug.Print htmlDoc.getElementsByTagName("TR")(0).innerText

Set htmlDoc = Nothing
Set httpReq = Nothing
MsgBox "スクレイピング完了"
End Sub
Private Function EncodeBase64Str(ByVal str As String) As String
'文字列をBase64エンコード
  Dim ret As String
  Dim d() As Byte
  Const adTypeBinary = 1
  Const adTypeText = 2
   
  ret = "" '初期化
  On Error Resume Next
  With CreateObject("ADODB.Stream")
    .Open
    .Type = adTypeText
    .Charset = "UTF-8"
    .WriteText str
    .Position = 0
    .Type = adTypeBinary
    .Position = 3
    d = .Read()
    .Close
  End With
  With CreateObject("MSXML2.DOMDocument").createElement("base64")
    .DataType = "bin.base64"
    .nodeTypedValue = d
    ret = .Text
  End With
  On Error GoTo 0
  EncodeBase64Str = ret
End Function

■参考URL
エクセルVBAでIEを使わずにHTMLドキュメントを取得する方法
エクセルVBAでHTTPリクエストをする最も簡単なプログラム

Option Explicit
Private Declare PtrSafe Sub Sleep Lib "kernel32" (ByVal dwMilliseconds As Long)
Private Function EncodeBase64Str(ByVal str As String) As String
'文字列をBase64エンコード
  Dim ret As String
  Dim d() As Byte
  Const adTypeBinary = 1
  Const adTypeText = 2
   
  ret = "" '初期化
  On Error Resume Next
  With CreateObject("ADODB.Stream")
    .Open
    .Type = adTypeText
    .Charset = "UTF-8"
    .WriteText str
    .Position = 0
    .Type = adTypeBinary
    .Position = 3
    d = .Read()
    .Close
  End With
  With CreateObject("MSXML2.DOMDocument").createElement("base64")
    .DataType = "bin.base64"
    .nodeTypedValue = d
    ret = .Text
  End With
  On Error GoTo 0
  EncodeBase64Str = ret
End Function
Sub HTTPリクエスト()

Dim httpReq As XMLHTTP60
Set httpReq = New XMLHTTP60

Dim URL As String
Dim UserName As String
Dim Password As String

URL = ""
UserName = ""
Password = ""

httpReq.Open "GET", URL, False
'-----------------------------------------------------
'★Authorizationヘッダーでユーザー名とパスワード送信
  httpReq.setRequestHeader "Authorization", "Basic " & EncodeBase64Str(UserName & ":" & Password)
   
  'キャッシュ対策
  httpReq.setRequestHeader "Pragma", "no-cache"
  httpReq.setRequestHeader "Cache-Control", "no-cache"
  httpReq.setRequestHeader "If-Modified-Since", "Thu, 01 Jun 1970 00:00:00 GMT"
   
  httpReq.Send
'----------------------------------------------------


'httpReq.Open "GET", "https://www.au.com/mobile/area/5g/station/"
'httpReq.Send

'Do While httpReq.readyState < 4
'    DoEvents
'Loop

Do While (httpReq.readyState < 4) And (httpReq.Status <> 200)
    DoEvents
Loop

Dim htmlDoc As Object
Set htmlDoc = New HTMLDocument

htmlDoc.write httpReq.responseText

MsgBox "スクレイピング開始"

Dim elementLength As Long

elementLength = htmlDoc.all.Length - 1

Dim wb As Workbook
Dim strPath As String

strPath = "C:\Users\cryst\OneDrive\デスクトップ\ExcelVBAプロジェクト\webスクレイピング\test.xlsx"

'Set wb = Workbooks.Add
'
'wb.SaveAs strPath

Dim i As Long
Dim stationName As String

'For i = 1023 To htmlDoc.all.Length - 1
'For i = 472 To 2000

For i = 472 To elementLength

'    Debug.Print htmlDoc.all(i).tagname
'    If i = 1023 Then
'        Stop
'    End If
    If htmlDoc.all(i).tagname = "H3" Then
        Debug.Print i & "番" & htmlDoc.all(i).innerText
    End If
'    If htmlDoc.all(i).tagname = "TR" And htmlDoc.all(i).tagname <> "" Then
'        Debug.Print i & "番" & htmlDoc.all(i).innerText
'        Debug.Print htmlDoc.all(i).className
'    End If

    If htmlDoc.all(i).tagname = "TD" And htmlDoc.all(i).tagname <> "" Then
        stationName = htmlDoc.all(i).innerText
        Debug.Print i & "番" & htmlDoc.all(i).innerText
        'Debug.Print htmlDoc.all(i).className
    End If
    If htmlDoc.all(i).tagname = "P" Then
        Debug.Print i & "番" & htmlDoc.all(i).className
    End If
'    If htmlDoc.all(i).innerText = "改札付" Then
'        Debug.Print i & "番" & htmlDoc.all(i).innerText
'    End If
    If htmlDoc.all(i).tagname = "SPAN" Then
        Debug.Print i & "番" & htmlDoc.all(i).className
    End If
'    If InStr(htmlDoc.all(i).innerText, "route") > 0 Or InStr(htmlDoc.all(i).innerText, "map") > 0 Then
'        Debug.Print htmlDoc.all(i).tagname
'        Debug.Print htmlDoc.all(i).className
'    End If
Next

'Debug.Print htmlDoc.Title
'Debug.Print htmlDoc.all(1023).innerText
'Debug.Print htmlDoc.all(1023).tagname
'Debug.Print htmlDoc.getElementsByTagName("TR")(0).innerText

Set htmlDoc = Nothing
Set httpReq = Nothing

MsgBox "スクレイピング完了"


End Sub
Sub HTTPリクエスト()

Dim httpReq As XMLHTTP60
Set httpReq = New XMLHTTP60

httpReq.Open "GET", "https://www.au.com/mobile/area/5g/station/"
httpReq.Send

Do While httpReq.readyState < 4
    DoEvents
Loop

Dim htmlDoc As Object
Set htmlDoc = New HTMLDocument

htmlDoc.write httpReq.responseText

Dim i As Long

'For i = 1023 To htmlDoc.all.Length - 1
For i = 472 To 2000
'    Debug.Print htmlDoc.all(i).tagname
'    If i = 1023 Then
'        Stop
'    End If
    If htmlDoc.all(i).tagname = "H3" Then
        Debug.Print i & "番" & htmlDoc.all(i).innerText
    End If
'    If htmlDoc.all(i).tagname = "TR" And htmlDoc.all(i).tagname <> "" Then
'        Debug.Print i & "番" & htmlDoc.all(i).innerText
'        Debug.Print htmlDoc.all(i).className
'    End If

    If htmlDoc.all(i).tagname = "TD" And htmlDoc.all(i).tagname <> "" Then
        Debug.Print i & "番" & htmlDoc.all(i).innerText
        'Debug.Print htmlDoc.all(i).className
    End If
    If htmlDoc.all(i).tagname = "P" Then
        Debug.Print i & "番" & htmlDoc.all(i).className
    End If
    If htmlDoc.all(i).innerText = "改札付" Then
        Debug.Print i & "番" & htmlDoc.all(i).innerText
    End If
    If htmlDoc.all(i).tagname = "SPAN" Then
        Debug.Print i & "番" & htmlDoc.all(i).className
    End If
'    If InStr(htmlDoc.all(i).innerText, "route") > 0 Or InStr(htmlDoc.all(i).innerText, "map") > 0 Then
'        Debug.Print htmlDoc.all(i).tagname
'        Debug.Print htmlDoc.all(i).className
'    End If
Next

'Debug.Print htmlDoc.Title
'Debug.Print htmlDoc.all(1023).innerText
'Debug.Print htmlDoc.all(1023).tagname
'Debug.Print htmlDoc.getElementsByTagName("TR")(0).innerText

Set htmlDoc = Nothing
Set httpReq = Nothing

End Sub

■参考URL
エクセルVBAでIEを使わずにHTMLドキュメントを取得する方法
エクセルVBAでHTTPリクエストをする最も簡単なプログラム