Files: 1f4de2eeab8c60b37f065c88bd87aa3ba6c51e0d / ggscrape
4273 bytesRaw
1 | |
2 | # vi: ts=4 sw=4 et |
3 | # |
4 | # ggscrape(1) |
# Download emails from a Google Group
6 | # |
7 | # Copyright (c) 2014 Charles Lehner |
8 | # Released under the terms of the MIT License. |
9 | |
# Version string printed by show_version.
VERSION=0.0.1

# Prefix prepended by req() to every query it sends.
BASE_URL='https://groups.google.com/forum/?_escaped_fragment_='

# Exit status used by unknown_option for command-line usage errors.
EX_USAGE=64

# Width of the topic window get_topics requests per call to get_topics_single.
topic_range=100

# Run-time settings; the argument parser further down fills these in.
opt_verbose=
group_id=
dest_dir=

# Cookie string handed to curl; taken from GG_COOKIE when that is non-empty.
cookie_str="${GG_COOKIE:-}"
24 | |
# Print a debug message on stderr when verbose mode is on.
#
# Arguments: the words of the message; they are joined with single spaces.
# Reads:     opt_verbose (any non-empty value enables output).
# Returns:   0 whether or not anything was printed.
debug_print() {
    if [[ -n $opt_verbose ]]; then
        # "$*" keeps the message exactly as passed. An unquoted $@ would be
        # word-split and glob-expanded, so a message such as "[1-100]" could
        # be replaced by a file called "1" or "0" in the current directory.
        # printf is used so a message starting with "-n"/"-e" is not taken
        # as an echo option.
        printf '%s\n' "$*" >&2
    fi
}
30 | |
# Send one query through curl and write the response to stdout.
#
# $1    query string appended to BASE_URL
# $2..  extra arguments handed to curl unchanged, after the URL
# Reads: BASE_URL, cookie_str.
# Returns: curl's exit status.
req() {
    local query=$1

    debug_print query: "$query"
    curl -sN -b "$cookie_str" "${BASE_URL}${query}" "${@:2}"
}
35 | |
# Check whether the group's topic listing can be fetched.
#
# Issues a header-only request (curl -I) for topics 1-1 of $group_id and
# inspects the status line(s) of the response.
#
# Reads:   group_id.
# Returns: 0 when the last HTTP status line reports 200, non-zero otherwise.
test_permission() {
    debug_print testing permission
    # Compare the numeric status code instead of the text "200 OK": HTTP/2
    # status lines carry no reason phrase ("HTTP/2 200"), so matching on
    # "200 OK" fails for them. Only the last status line is kept, in case
    # earlier header blocks precede the final response.
    req "forum/${group_id}%5B1-1-false%5D" -I |
        awk '/^HTTP\// { code = $2 + 0 } END { exit (code == 200 ? 0 : 1) }'
}
40 | |
# List one window of topics of $group_id.
#
# $1  first topic number of the window
# $2  last topic number of the window
#
# Output: one line per topic, built from three consecutive lines of the
# first sed stage and emitted as "<line1> TAB <line3> TAB <line2>"
# (presumably topic id, last post date, title -- this depends on the order
# of the cells in the fetched page).
# Returns: the status of the last sed; 99 when it meets an empty line, which
# the first stage emits for a page reporting "of 0 topics".
get_topics_single() {
    local first=$1
    local last=$2
    local extract_rows join_rows

    # Stage 1: print an empty line for an "of 0 topics" page; put a newline
    # after <tr>; on lastPostDate lines keep only the text following
    # lastPostDate"> ; on subject lines print the last path segment of the
    # href and the link text on two separate lines.
    extract_rows='s/^<i>Showing [^<]* of 0 topics<\/i>$//p;
        s/<tr>/\0\n/; /lastPostDate/ {
            s/.*lastPostDate">\([^<]*\).*$/\1/m; P; D;
        };
        /subject/{
            s/.*href="[^"]*\/\([^"]*\)"[^>]*>\([^<]*\).*/\1\n\2/; p;
        }'

    # Stage 2: quit with status 99 on an empty line (the exit code argument
    # to q is a GNU sed feature); otherwise join three lines into one
    # tab-separated record.
    join_rows='/^$/q99; N; N; s/\(.*\)\n\(.*\)\n\(.*\)/\1\t\3\t\2/'

    debug_print "get topics $group_id [$first-$last]"

    req "forum/${group_id}%5B${first}-${last}-false%5D" \
        | sed -n "$extract_rows" \
        | sed "$join_rows"
}
57 | |
# Complain on stderr about an unrecognised argument and terminate the script.
#
# $1  the offending argument
# Exits with status $EX_USAGE; never returns.
unknown_option() {
    printf 'Unknown option %s\n' "$1" >&2
    exit $EX_USAGE
}
62 | |
# List topics of $group_id, fetching them in windows of $topic_range.
#
# $1  first topic number; empty or below 1 means 1
# $2  last topic number; empty or 0 means "until a window fails"
#
# Both arguments are evaluated as shell arithmetic. Output is whatever
# get_topics_single prints for each window. The loop stops at the first
# window for which get_topics_single returns non-zero.
# Returns: 0.
get_topics() {
    # Default before the arithmetic evaluation: $(( )) of an empty string
    # yields 0, so a later emptiness test could never fire and the listing
    # would begin at topic 0 instead of 1.
    local start=$(( ${1:-1} ))
    local end=$(( ${2:-0} ))
    local i temp_end

    if (( start < 1 )); then
        start=1
    fi

    debug_print "get all topics $group_id [$start-$end]"

    for (( i = start; end == 0 || i <= end; i += topic_range )); do
        # Clamp the window to the requested end, when there is one.
        temp_end=$(( i + topic_range - 1 ))
        if (( end != 0 && temp_end > end )); then
            temp_end=$end
        fi
        get_topics_single "$i" "$temp_end" || break
    done
}
79 | |
# Get the message ids in a topic.
#
# $1  topic id
# Reads: group_id.
# Output: one line per match -- the part of the href that follows
# "<topic id>/" in each <td class="subject"><a href="..."> cell of the page.
# Returns: sed's exit status.
get_messages() {
    local topic_id="$1"
    # The topic id is spliced into the sed script inside double quotes so
    # whitespace or glob characters in it cannot split the script into
    # several arguments. It is still used as-is in the pattern: regex
    # metacharacters in the id are not escaped.
    req "topic/${group_id}/${topic_id}" | sed -n \
        's/.*<td class="subject"><a href="[^"]*'"$topic_id"'\/\([^"]*\)".*/\1/p'
}
86 | |
# Download one message into $dest_dir unless it is already there.
#
# $1  topic id
# $2  message id
# Reads: dest_dir, group_id.
#
# The target is "<dest_dir>/<group_id><topic_id><msg_id>.eml". A target that
# already exists with non-zero size is reported on stdout and left untouched.
# Returns: 0 when the file exists or was downloaded, 1 when the download
# failed or produced no data.
download_message() {
    local topic_id="$1"
    local msg_id="$2"
    local path tmp   # local so they do not leak into the caller's scope

    debug_print download topic "$topic_id" message "$msg_id"
    path="${dest_dir}/${group_id}${topic_id}${msg_id}.eml"
    if [[ -s "$path" ]]; then
        echo "file for message ${msg_id} already exists. skipping."
        return 0
    fi

    # Fetch into a scratch file and rename only on success. Writing straight
    # to $path would leave a truncated, non-empty file behind after an
    # interrupted transfer, and the -s test above would then skip that
    # message on every later run.
    tmp="${path}.part"
    if req "${group_id}/${topic_id}/${msg_id}" > "$tmp" && [[ -s "$tmp" ]]; then
        mv -f -- "$tmp" "$path"
    else
        rm -f -- "$tmp"
        echo "failed to download message ${msg_id}" >&2
        return 1
    fi
}
98 | |
# Download every message of one topic.
#
# $1  topic id
# Feeds the ids printed by get_messages, one per line, to download_message;
# only the first word of each line is used as the message id.
download_messages() {
    local topic="$1"

    debug_print download topic $topic
    get_messages "$topic" |
        while read message rest; do
            download_message "$topic" "$message"
        done
}
106 | |
# Download the messages of every topic in a range.
#
# $1  first topic number (passed through to get_topics)
# $2  last topic number (passed through to get_topics)
#
# Reads the lines printed by get_topics; the first whitespace-separated
# field of each line is taken as the topic id and handed to
# download_messages.
download_mails() {
    local start=$1
    local end=$2
    local topic_id topic_title

    get_topics "$start" "$end" | while read -r topic_id topic_title; do
        # The topic listing can contain an empty line (get_topics_single's
        # final sed prints the empty line it quits on). Skip it rather than
        # issuing a request for a topic with an empty id.
        if [[ -z "$topic_id" ]]; then
            continue
        fi
        download_messages "$topic_id"
    done
}
115 | |
# Print the program name followed by $VERSION on stdout.
show_version() {
    local program=ggscrape

    echo "$program" $VERSION
}
119 | |
# Print the usage summary on stdout.
# The text is static, so the heredoc delimiter is quoted and nothing in the
# body is subject to expansion.
show_help() {
    cat <<'USAGE'
ggscrape. Download emails from a Google Group

Usage:
ggscrape <group_id> [test]
ggscrape <group_id> topics
ggscrape <group_id> messages <topic_id>
ggscrape <group_id> download <directory>

Options:
-h, --help Show this screen
--version Show version
-v, --verbose Show debug info
-c, --cookie <cookie> Use the given cookie string
-b, --begin <topicnum> Topic number at which to begin downloading
-e, --end <topicnum> Topic number at which to stop downloading

Environmental variables:

GG_COOKIE use as value for --cookie

USAGE
}
144 | |
# ---- command-line parsing and dispatch -------------------------------------

# Selected sub-command and its operands.
cmd=
topic_id=
topic_begin=
topic_end=

# Exit with a usage error unless an option is followed by a value.
# $1  the option as typed
# $2  number of arguments still unparsed, the option itself included
require_value() {
    if [[ "$2" -lt 2 ]]; then
        echo "Option $1 requires an argument" >&2
        exit "$EX_USAGE"
    fi
}

# Walk the arguments left to right. The first word that is neither an option
# nor a sub-command name becomes the group id; a second such word is an error.
while [[ "$#" -gt 0 ]]; do
    case "$1" in
        -h|--help) show_help; exit;;
        --version) show_version; exit;;
        -v|--verbose) opt_verbose=1;;
        -c|--cookie) require_value "$1" "$#"; cookie_str="$2"; shift;;
        -b|--begin) require_value "$1" "$#"; topic_begin="$2"; shift;;
        -e|--end) require_value "$1" "$#"; topic_end="$2"; shift;;
        topics) cmd=topics;;
        test) cmd=test;;
        messages)
            cmd=messages
            # Consume the operand only when there is one; its absence is
            # reported after parsing.
            if [[ "$#" -ge 2 ]]; then
                topic_id="$2"
                shift
            fi;;
        download)
            cmd=download
            if [[ "$#" -ge 2 ]]; then
                dest_dir="$2"
                shift
            fi;;
        *)
            if [[ -z "$group_id" ]]; then
                group_id="$1"
            else
                unknown_option "$1"
            fi;;
    esac
    shift
done

if [[ -z "$group_id" ]]; then
    show_help
    exit 1
fi

case "$cmd" in
    '') show_help; exit;;
    test) test_permission;;
    topics) get_topics "$topic_begin" "$topic_end";;
    download)
        # Without a directory every message would be written relative to
        # the filesystem root.
        if [[ -z "$dest_dir" ]]; then
            echo "download requires a destination directory" >&2
            exit "$EX_USAGE"
        fi
        # Create the destination up front instead of failing once per message.
        mkdir -p -- "$dest_dir" || exit 1
        download_mails "$topic_begin" "$topic_end";;
    messages)
        if [[ -z "$topic_id" ]]; then
            echo "messages requires a topic id" >&2
            exit "$EX_USAGE"
        fi
        get_messages "$topic_id";;
    *)
        echo "Unknown command $cmd" >&2
        exit "$EX_USAGE";;
esac
184 |
Built with git-ssb-web