Class: YoutubeCaptions

Inherits:
Object
  • Object
show all
Includes:
HTTParty
Defined in:
lib/youtube-captions.rb

Constant Summary collapse

TRANSLATABLE_REGEX =
/({"captionTracks":.*"isTranslatable":(true|false)}\])/

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(id:, lang: nil) ⇒ YoutubeCaptions

Returns a new instance of YoutubeCaptions.



8
9
10
11
# File 'lib/youtube-captions.rb', line 8

# Stores the video identifier and the optional caption language.
#
# @param id [Object] video identifier; #call interpolates it into the
#   watch-page URL
# @param lang [Object, nil] caption language code that #call matches against
#   each track's "vssId"; defaults to nil
def initialize(id:, lang: nil)
  @id, @lang = id, lang
end

Instance Attribute Details

#id ⇒ Object (readonly)

Returns the value of attribute id.



7
8
9
# File 'lib/youtube-captions.rb', line 7

# Reader for the video identifier.
#
# @return [Object] the value given as +id:+ to the constructor, unchanged
def id
  @id
end

#lang ⇒ Object (readonly)

Returns the value of attribute lang.



7
8
9
# File 'lib/youtube-captions.rb', line 7

# Reader for the requested caption language.
#
# @return [Object, nil] the value given as +lang:+ to the constructor, or nil
#   when none was supplied
def lang
  @lang
end

Instance Method Details

#call ⇒ Object

Raises:

  • (StandardError)
  • (ArgumentError)


15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
# File 'lib/youtube-captions.rb', line 15

# Downloads the watch page for +id+, picks one caption track from the
# embedded "captionTracks" JSON and returns that track's transcript as text.
#
# Track choice: with a non-blank +lang+, the track whose "vssId" is
# ".<lang>" is preferred over "a.<lang>" (presumably manual vs.
# auto-generated captions -- confirm). Without +lang+, "en" is tried the same
# way and the first listed track is the fallback.
#
# @return [String] the caption lines, entity-decoded and joined by one space
# @raise [StandardError] when the page has no caption track data, or the
#   default track has no "baseUrl"
# @raise [ArgumentError] when +lang+ is given but no track with a "baseUrl"
#   matches it
def call
  youtube_html = self.class.get("https://www.youtube.com/watch?v=#{id}")
  match_data = youtube_html.match(TRANSLATABLE_REGEX)
  raise StandardError, "There are no captions" unless match_data

  # The capture ends at the "]" closing the track array, so the enclosing
  # object's "}" has to be re-added to get parseable JSON.
  caption_tracks = JSON.parse("#{match_data[1]}}")["captionTracks"]
  find_track = lambda do |code|
    caption_tracks.find { |track| track["vssId"] == ".#{code}" } ||
      caption_tracks.find { |track| track["vssId"] == "a.#{code}" }
  end

  # Core-Ruby blank check (nil, false, empty or whitespace-only count as
  # "not given") so the method does not depend on ActiveSupport's #present?.
  lang_given = lang && lang.to_s !~ /\A[[:space:]]*\z/
  if lang_given
    subtitle = find_track.call(lang)
    raise ArgumentError, "Lang no available" unless subtitle && subtitle["baseUrl"]
  else
    subtitle = find_track.call("en") || caption_tracks.first
    raise StandardError, "There are no captions" unless subtitle && subtitle["baseUrl"]
  end

  transcript_xml = self.class.get(subtitle["baseUrl"]).to_s
  # One chunk per <text> element; whitespace-only leftovers (e.g. a trailing
  # newline) are dropped so they do not add stray spaces to the result.
  transcript_tags = transcript_xml
                    .gsub('<?xml version="1.0" encoding="utf-8" ?><transcript>', '')
                    .gsub('</transcript>', '')
                    .split('</text>')
                    .reject { |chunk| chunk.strip.empty? }

  transcript_parts = transcript_tags.map do |transcript_tag|
    # Order matters: drop the opening <text ...> tag, undo the outer "&amp;"
    # escaping, strip any inline markup, decode numeric entities as UTF-8,
    # then let CGI handle the remaining named entities.
    encoded_transcript = transcript_tag.sub(/<text[^>]*>/, '')
                                       .gsub(/&amp;/i, '&')
                                       .gsub(/<\/?[^>]+(>|\z)/, '')
                                       .gsub(/&#(\d+);/) { [Regexp.last_match(1).to_i].pack('U*') }
    CGI.unescapeHTML(encoded_transcript)
  end

  transcript_parts.join(" ")
end