Class: YoutubeCaptions

Inherits:

Object

Object
YoutubeCaptions

show all

Includes: HTTParty

Defined in: lib/youtube-captions.rb

Constant Summary collapse

TRANSLATABLE_REGEX =

/({"captionTracks":.*"isTranslatable":(true|false)}\])/

Instance Attribute Summary collapse

#id ⇒ Object readonly

Returns the value of attribute id.
#lang ⇒ Object readonly

Returns the value of attribute lang.

Instance Method Summary collapse

#call ⇒ Object
#initialize(id:, lang: nil) ⇒ YoutubeCaptions constructor

A new instance of YoutubeCaptions.

Constructor Details

#initialize(id:, lang: nil) ⇒ `YoutubeCaptions`

Returns a new instance of YoutubeCaptions.

# File 'lib/youtube-captions.rb', line 8

# Stores the video id and the optional caption language on the instance.
#
# @param id [Object] value later interpolated into the watch-page URL by #call
# @param lang [Object, nil] language code matched against the tracks' "vssId"
#   in #call; defaults to nil
def initialize(id:, lang: nil)
  @id = id
  @lang = lang
end

Instance Attribute Details

#id ⇒ `Object` (readonly)

Returns the value of attribute id.



7
8
9

# File 'lib/youtube-captions.rb', line 7

# Reader for the @id instance variable assigned in the constructor.
#
# @return [Object] the id passed as the +id:+ keyword
def id
  @id
end

#lang ⇒ `Object` (readonly)

Returns the value of attribute lang.



7
8
9

# File 'lib/youtube-captions.rb', line 7

# Reader for the @lang instance variable assigned in the constructor.
#
# @return [Object, nil] the value passed as the +lang:+ keyword (nil by default)
def lang
  @lang
end

Instance Method Details

#call ⇒ `Object`

Raises:

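(StandardError) — when the watch page contains no caption tracks. An ArgumentError (a StandardError subclass) is raised when a lang was requested but no matching track is available.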

# File 'lib/youtube-captions.rb', line 15

# Downloads the watch page for +id+, selects one caption track and returns
# its transcript as a single plain-text string.
#
# Track selection:
# * when +lang+ is set (non-nil, not blank), the track whose "vssId" is
#   ".<lang>" is preferred, then "a.<lang>"
#   (NOTE(review): presumably manual vs. auto-generated captions — confirm);
# * otherwise ".en", then "a.en", then the first listed track.
#
# @return [String] the caption fragments joined with single spaces
# @raise [StandardError] when the page exposes no usable caption track
# @raise [ArgumentError] when +lang+ was requested but has no usable track
def call
  youtube_html = self.class.get("https://www.youtube.com/watch?v=#{id}")
  match_data = youtube_html.match(TRANSLATABLE_REGEX)
  raise StandardError, "There are no captions" unless match_data

  # The capture ends right after the track array, so the closing brace of the
  # enclosing object has to be re-added to obtain valid JSON.
  caption_tracks = JSON.parse("#{match_data[1]}}")["captionTracks"]
  track_for = lambda do |code|
    caption_tracks.find { |track| track["vssId"] == ".#{code}" } ||
      caption_tracks.find { |track| track["vssId"] == "a.#{code}" }
  end

  # Plain-Ruby blank check so the method does not depend on ActiveSupport.
  if lang.nil? || lang.to_s.strip.empty?
    subtitle = track_for.call("en") || caption_tracks.first
    raise StandardError, "There are no captions" unless subtitle && subtitle["baseUrl"]
  else
    subtitle = track_for.call(lang)
    raise ArgumentError, "Lang no available" unless subtitle && subtitle["baseUrl"]
  end

  transcript_xml = self.class.get(subtitle["baseUrl"]).to_s
  transcript_tags = transcript_xml
                    .gsub('<?xml version="1.0" encoding="utf-8" ?><transcript>', '')
                    .gsub('</transcript>', '')
                    .split('</text>')
                    .reject { |fragment| fragment.strip.empty? } # drop whitespace-only leftovers

  transcript_parts = transcript_tags.map do |transcript_tag|
    encoded_transcript = transcript_tag
                         .gsub(/<text[^>]*>/, '')          # opening <text ...> tag only
                         .gsub(/&amp;/i, '&')              # undo the extra level of escaping
                         .gsub(%r{</?[^>]+(?:>|\z)}, '')   # strip any remaining markup
                         .gsub(/&#(\d+);/) { [Regexp.last_match(1).to_i].pack('U*') }
    CGI.unescapeHTML(encoded_transcript)
  end

  transcript_parts.join(" ")
end

Class: YoutubeCaptions

Constant Summary collapse

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(id:, lang: nil) ⇒ YoutubeCaptions

Instance Attribute Details

#id ⇒ Object (readonly)

#lang ⇒ Object (readonly)

Instance Method Details

#call ⇒ Object

#initialize(id:, lang: nil) ⇒ `YoutubeCaptions`

#id ⇒ `Object` (readonly)

#lang ⇒ `Object` (readonly)

#call ⇒ `Object`