Class: Birdwatcher::Modules::Statuses::Wordlist

Inherits:
Birdwatcher::Module show all
Defined in:
lib/birdwatcher/modules/statuses/word_list.rb

Constant Summary

Constants inherited from Birdwatcher::Module

Birdwatcher::Module::MODULE_PATH

Constants included from Concerns::Concurrency

Concerns::Concurrency::DEFAULT_THREAD_POOL_SIZE

Constants included from Concerns::Core

Concerns::Core::DATA_DIRECTORY

Class Method Summary collapse

Instance Method Summary collapse

Methods inherited from Birdwatcher::Module

_file_path, _file_path=, descendants, #execute, inherited, meta, meta=, module_by_path, module_paths, modules, path

Methods included from Concerns::WordList

included, #make_word_list

Methods included from Concerns::Concurrency

included, #thread_pool

Methods included from Concerns::Persistence

included, #save_status, #save_user

Methods included from Concerns::Presentation

included, #make_status_summary_output, #make_url_summary_output, #make_user_details_output, #make_user_summary_output, #output_status_summary, #output_user_details, #output_user_summary, #page_text

Methods included from Concerns::Outputting

#confirm, #error, #fatal, included, #info, #line_separator, #newline, #output, #output_formatted, #task, #warn

Methods included from Concerns::Util

#escape_html, #excerpt, included, #number_to_human_size, #parse_time, #pluralize, #strip_control_characters, #strip_html, #suppress_output, #suppress_warnings, #time_ago_in_words, #unescape_html

Methods included from Concerns::Core

#console, #current_workspace, #current_workspace=, #database, included, #klout_client, #read_data_file, #twitter_client

Class Method Details

.info ⇒ Object



79
80
81
82
83
84
85
86
87
# File 'lib/birdwatcher/modules/statuses/word_list.rb', line 79

# Human-readable description of the Word List module.
#
# The text is kept flush-left inside the heredoc because `<<-` preserves
# leading whitespace of the body verbatim.
#
# @return [String] multi-line description ending with a newline
def self.info
  description = <<-DESCRIPTION
The Word List module can generate a simple word list or dictionary from words
used in statuses across all or specific users.

Since users Tweet about their hobbies, interests, work, etc. generating a word
list from statuses can be very effective for password cracking.
  DESCRIPTION
  description
end

Instance Method Details

#run ⇒ Object



89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
# File 'lib/birdwatcher/modules/statuses/word_list.rb', line 89

# Builds a word list from statuses in the current workspace and writes it to
# the file named by the DEST option.
#
# Options read via +option_setting+:
# * USERS               - space-separated screen names; when set, only statuses
#                         whose user_id matches those users are processed
# * MIN_WORD_COUNT, MIN_WORD_LENGTH, EXCLUDE_WORDS, EXCLUDE_STOPWORDS,
#   EXCLUDE_COMMON, EXCLUDE_HASHTAGS, EXCLUDE_MENTIONS, WORD_CAP
#                       - forwarded to +make_word_list+
# * INCLUDE_PAGE_TITLES - also feed titles from each status's urls_dataset
#                         (rows with a title whose final_url is not under
#                         https://twitter.com/) into the corpus
# * INCLUDE_COUNT       - write "word, count" lines instead of bare words
# * DEST                - path of the output file (opened in "w" mode)
#
# @return [false] when there are no statuses to process
# @return [Object] otherwise, whatever the closing +info+ call returns
def run
  statuses =
    if option_setting("USERS")
      names = option_setting("USERS").split(" ").map { |name| name.strip }
      ids   = current_workspace.users_dataset.where("screen_name IN ?", names).map { |user| user.id }
      current_workspace.statuses_dataset.where("user_id IN ?", ids)
    else
      current_workspace.statuses_dataset
    end

  if statuses.count.zero?
    error("There are no statuses to process")
    return false
  end

  # Passed as a brace-less hash so the call shape matches what
  # make_word_list receives today (positional hash vs. keywords).
  word_list = make_word_list(
    min_word_count:       option_setting("MIN_WORD_COUNT"),
    min_word_length:      option_setting("MIN_WORD_LENGTH"),
    exclude_words:        option_setting("EXCLUDE_WORDS").to_s.split(" ").map { |entry| entry.strip },
    exclude_stopwords:    option_setting("EXCLUDE_STOPWORDS"),
    exclude_common_words: option_setting("EXCLUDE_COMMON"),
    exclude_hashtags:     option_setting("EXCLUDE_HASHTAGS"),
    exclude_mentions:     option_setting("EXCLUDE_MENTIONS"),
    word_cap:             option_setting("WORD_CAP"),
    stopwords_file:       File.join(DATA_DIRECTORY, "english_stopwords.txt"),
    common_words_file:    File.join(DATA_DIRECTORY, "top100Kenglishwords.txt")
  )

  task("Processing #{statuses.count.to_s.bold} statuses...") do
    statuses.each do |status|
      word_list.add_to_corpus(status.text)
      next unless option_setting("INCLUDE_PAGE_TITLES")

      titles = status.urls_dataset
        .where("title IS NOT NULL")
        .where("final_url NOT LIKE 'https://twitter.com/%'")
        .map { |url| url.title }
      titles.each { |title| word_list.add_to_corpus(title) }
    end
    word_list.process
  end

  task("Writing #{pluralize(word_list.word_list.length, 'word', 'words')} to file...") do
    File.open(option_setting("DEST"), "w") do |file|
      # Each entry destructures into the word and its count.
      word_list.word_list.each do |word, count|
        line = option_setting("INCLUDE_COUNT") ? "#{word}, #{count}" : word
        file.puts(line)
      end
    end
  end

  written = number_to_human_size(File.size(option_setting("DEST")))
  info("Wrote #{written.bold} to #{option_setting('DEST').bold}")
end