Class: YoutubeTranscript2020

Inherits:
Object
  • Object
show all
Defined in:
lib/youtube_transcript2020.rb

Overview

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(id = nil, debug: false) ⇒ YoutubeTranscript2020

Returns a new instance of YoutubeTranscript2020.


17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
# File 'lib/youtube_transcript2020.rb', line 17

# Creates a transcript object, optionally bound to a YouTube video.
#
# @param id [String, nil] a video id or a full http(s) video URL; when nil,
#   nothing is fetched and only the debug flag is stored
# @param debug [Boolean] when true, #import and #to_html print diagnostics
def initialize(id=nil, debug: false)

  # Store the flag before the guard below so that it is honoured even when
  # the object is created without an id.
  @debug = debug

  return unless id

  # A full URL is reduced to its video id; anything else is used as given.
  @id = id[/https?:\/\//] ? YoutubeID.from(id) : id

  # Fetching the transcript from the following statement no longer works.
  # Instead, copy and paste the transcript from the YouTube video page into
  # a text file and import it.
  #
  #s = Net::HTTP.get(URI("http://video.google.com/timedtext?lang=en&v=#{@id}"))
  #@s = parse(s) unless s.empty?

  fetch_info(@id)

end

Instance Attribute Details

#authorObject (readonly)

Returns the value of attribute author.


15
16
17
# File 'lib/youtube_transcript2020.rb', line 15

# Read-only accessor: returns the current value of @author (nil when it has
# not been assigned). #import assigns it from the :author entry of a
# transcript header.
def author
  @author
end

#idObject (readonly)

Returns the value of attribute id.


15
16
17
# File 'lib/youtube_transcript2020.rb', line 15

# Read-only accessor: returns the current value of @id (nil when it has not
# been assigned). The constructor assigns it from its id argument and #import
# assigns it from the :id entry of a transcript header.
def id
  @id
end

#titleObject (readonly)

Returns the value of attribute title.


15
16
17
# File 'lib/youtube_transcript2020.rb', line 15

# Read-only accessor: returns the current value of @title (nil when it has
# not been assigned). #import assigns it from the :title entry of a
# transcript header.
def title
  @title
end

#to_aObject (readonly)

Returns the value of attribute to_a.


15
16
17
# File 'lib/youtube_transcript2020.rb', line 15

# Read-only accessor: returns the current value of @to_a (nil when it has not
# been assigned).
# NOTE(review): none of the methods shown alongside this one assign @to_a
# (#import fills @a instead) - presumably it is set by a helper defined
# elsewhere; confirm before relying on it.
def to_a
  @to_a
end

Instance Method Details

#import(obj) ⇒ Object

reads a plain text transcript which has been modified to include headings


54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
# File 'lib/youtube_transcript2020.rb', line 54

# Loads a transcript from +obj+ and stores it in @a (timestamp/text pairs)
# and @s.
#
# Three input shapes are distinguished, in this order:
# 1. content containing a dashed rule: the part before the rule is parsed as
#    a SimpleConfig header (id, author, title) and the remainder is the body;
# 2. a name ending in '.json': the content is parsed as JSON and each
#    element's 'start' and 'text' values become one pair;
# 3. anything else: +obj+ itself is treated as a raw transcript.
#
# @param obj [String] passed to RXFReader.read; also inspected directly
def import(obj)

  content = RXFReader.read(obj).first
  raw_transcript = false

  if content =~ /------+/

    header, body = content.split(/-----+/, 2)
    config = SimpleConfig.new(header).to_h

    @id     = config[:id]
    @author = config[:author]
    @title  = config[:title]
    @s      = body

  elsif File.extname(obj) == '.json'

    entries = JSON.parse(content)
    @a = entries.map { |entry| [entry['start'], entry['text']] }
    @s = join_sentences(@a)

    return

  else

    # NOTE(review): this uses the argument rather than the content read
    # above - presumably the two are identical for raw text; confirm.
    body = obj
    raw_transcript = true

  end

  puts 'body: ' + body[0..400] if @debug

  # Lines containing digits:digits count as timestamps; they are paired
  # positionally with the remaining lines.
  stamps, captions = body.lines.map(&:chomp).partition { |line| line =~ /\d+:\d+/ }
  @a = stamps.zip(captions)

  @s = join_sentences(@a) if raw_transcript

end

#to_headingsObject

Outputs plain text containing the headings, including timestamps. Note: this can be helpful for copying and pasting directly into a YouTube comment.


137
138
139
140
141
# File 'lib/youtube_transcript2020.rb', line 137

# Returns an Array holding the first element of every @to_a entry whose
# first element (the timestamp) contains a space.
def to_headings

  with_heading = @to_a.select { |stamp, _text| stamp =~ / / }
  with_heading.map { |entry| entry.first }

end

#to_htmlObject

Outputs HTML containing the embedded video and transcription


91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
# File 'lib/youtube_transcript2020.rb', line 91

# Builds a standalone HTML page containing the embedded video (from
# @html_embed), the title, and a scrollable list with one link per @a entry.
# Each link targets the frame named 'video' and carries a start offset.
#
# @return [String] the complete HTML document
def to_html

  url = 'https://www.youtube.com/embed/' + @id

  links = @a.map do |timestamp, s|

    # The colon-separated timestamp parts are handed to Subunit, whose integer
    # result is used as the start offset (presumably seconds - Subunit is an
    # external helper; confirm).
    parts = timestamp.split(':').map(&:to_i)
    seconds = Subunit.new({minutes: 60, hours: 60}, parts).to_i

    format("<li><a href='%s?start=%s&autoplay=1' target='video'>%s</a><p>%s</p></li> ",
           url, seconds, timestamp, s)
  end

  puts '@html_embed: ' + @html_embed.inspect if @debug
  doc = Rexle.new(@html_embed.to_s)
  puts 'before attributes' if @debug

  # Name the embed element so the links' target='video' resolves to it.
  doc.root.attributes[:name] = 'video'
  embed = doc.xml(declaration: false)
  puts 'embed: ' + embed.inspect if @debug

  # A heredoc is used so that embed, @title and the links are interpolated
  # into the page rather than emitted as literal placeholder text.
  <<~HTML
    <!DOCTYPE html>
    <html lang="en">
    <head>
      <title></title>
      <meta charset="utf-8" />
    </head>
    <body>
    <div style="width: 1080px; background: white">
    <div style="float:left; width: 580px; background: white">
    #{embed}
    <h1>#{@title}</h1>
    </div>
    <div style="float:right; width: 500px; overflow-y: scroll; height: 400px">
    <ul>#{links.join("\n")}</ul>
    </div>

    </div>
    </body>
    </html>
  HTML
end

#to_keywords(level: 2) ⇒ Object

Returns a Hash object containing the frequency of each word. level: 2 ignores common words, including stop words; level: 3 ignores dictionary words.


147
148
149
# File 'lib/youtube_transcript2020.rb', line 147

# Passes the plain transcript text (see #to_text) to Yawc together with the
# requested level and returns the result of its #to_h.
#
# @param level [Integer] forwarded unchanged to Yawc
def to_keywords(level: 2)
  plain_text = self.to_text
  counter = Yawc.new(plain_text, level: level)
  counter.to_h
end

#to_sObject

returns the transcript in plain text including timestamps


42
43
44
45
46
# File 'lib/youtube_transcript2020.rb', line 42

# Serialises the transcript: a SimpleConfig header built from the id, title
# and author, then a rule of 78 dashes, a blank line, and the contents of @s.
def to_s

  meta = {id: @id, title: @title, author: @author}
  header = SimpleConfig.new(meta).to_s
  rule = "\n#{'-' * 78}\n\n"

  header + rule + @s
end

#to_textObject


48
49
50
# File 'lib/youtube_transcript2020.rb', line 48

# Returns the last element of every @a entry (the text part of each pair),
# joined with newlines.
def to_text
  spoken = @a.map { |entry| entry.last }
  spoken.join("\n")
end