Files: 8587a7ef0afc0ac2f7e6bf18c7f56c9b2d4eeec6 / ggscrape
4798 bytesRaw
1 | |
2 | # vi: ts=4 sw=4 et |
3 | # |
4 | # ggscrape(1) |
5 | # Download emails from a Google Group |
6 | # |
7 | # Copyright (c) 2014 Charles Lehner |
8 | # Released under the terms of the MIT License. |
9 | |
# Program version reported by --version.
readonly VERSION=0.0.1
# Prefix that every request path is appended to.
readonly BASE_URL='https://groups.google.com/forum/'

# Exit status used for command-line usage errors.
readonly EX_USAGE=64
# Number of topics requested per listing page.
topic_range=100

# Option state; the command-line parser overwrites these.
opt_verbose=
group_id=
dest_dir=
ln_dir=
# Cookie string handed to curl. GG_COOKIE provides the default and
# --cookie overrides it.
cookie_str=${GG_COOKIE:-}
25 | |
# debug_print WORD...
# Write the arguments, joined by single spaces, as one line on stderr
# when verbose mode is on (opt_verbose is non-empty). Does nothing
# otherwise. Always returns 0.
debug_print() {
    if [[ -n $opt_verbose ]]; then
        # printf with "$*" emits the arguments verbatim: no word
        # re-splitting, no glob expansion, no echo option parsing.
        printf '%s\n' "$*" >&2
    fi
}
31 | |
# req PATH [CURL_ARG...]
# Fetch BASE_URL followed by PATH with curl, sending cookie_str as the
# cookie. Any further arguments are handed to curl unchanged (callers
# use this for -I and -o FILE). Output and exit status are curl's.
req() {
    debug_print req: "${1-}"
    # Only the first argument is part of the URL; "${@:2}" keeps the
    # remaining ones as separate curl arguments.
    curl -sN -b "$cookie_str" "${BASE_URL}${1-}" "${@:2}"
}
36 | |
# req_fragment FRAGMENT [CURL_ARG...]
# Request the page addressed by the "?_escaped_fragment_=" query
# parameter set to FRAGMENT. Extra arguments are forwarded to req
# untouched, which in turn passes them to curl.
req_fragment() {
    req "?_escaped_fragment_=${1-}" "${@:2}"
}
40 | |
# test_permission
# Check whether the topic listing of group_id can be fetched with the
# current cookie. Requests only the response headers (-I) for a
# one-topic listing and returns 0 when the status line reports 200,
# non-zero otherwise.
test_permission() {
    debug_print testing permission
    # Match the status line for any protocol version: HTTP/1.x sends
    # "HTTP/1.1 200 OK", HTTP/2 sends "HTTP/2 200" with no reason phrase.
    req_fragment "forum/${group_id}%5B1-1-false%5D" -I |
        grep -Eq '^HTTP/[0-9.]+ 200([^0-9]|$)'
}
45 | |
# get_topics_single START END
# Fetch one page of the topic listing of group_id (positions START to
# END) and print one line per topic. Each output line is three
# consecutive extracted lines joined as first<TAB>third<TAB>second;
# callers read the first field as the topic id.
# Returns the exit status of the final sed: 99 when the page reports
# zero topics. Relies on GNU sed (q with an exit code, \n and \0 in
# replacements).
get_topics_single() {
    local first=$1 last=$2
    local page="forum/${group_id}%5B${first}-${last}-false%5D"

    # Stage 1: turn the "Showing ... of 0 topics" notice into an empty
    # line, and pull the lastPostDate value and the subject link's last
    # path component and link text out of the listing rows.
    local extract="s/^<i>Showing [^<]* of 0 topics<\/i>$//p;
        s/<tr>/\0\n/; /lastPostDate/ {
            s/.*lastPostDate\">\([^<]*\).*$/\1/m; P; D;
        };
        /subject/{
            s/.*href=\"[^\"]*\/\([^\"]*\)\"[^>]*>\([^<]*\).*/\1\n\2/; p;
        }"

    # Stage 2: quit with status 99 on an empty line (the zero-topics
    # marker from stage 1); otherwise join three lines into one
    # tab-separated record.
    local regroup='/^$/q99; N; N; s/\(.*\)\n\(.*\)\n\(.*\)/\1\t\3\t\2/'

    debug_print "get topics $group_id [$first-$last]"

    req_fragment "$page" | sed -n "$extract" | sed "$regroup"
}
62 | |
# unknown_option ARG
# Complain on stderr about an unrecognised command-line argument, then
# terminate the script with the usage-error status EX_USAGE.
unknown_option() {
    local bad_arg=$1
    >&2 echo "Unknown option ${bad_arg}"
    exit $EX_USAGE
}
67 | |
# get_topics START END
# Print the topic listing of group_id from position START to END by
# requesting pages of topic_range topics via get_topics_single.
#   START  first position; empty or 0 means 1
#   END    last position; empty or 0 means "no limit"
# Paging stops at END, or as soon as get_topics_single returns
# non-zero (it returns 99 when a page has no topics).
get_topics() {
    local start=$(( ${1:-0} ))
    local end=$(( ${2:-0} ))
    # The loop counter is local so it cannot clobber a caller's variable.
    local i page_end

    if (( start == 0 )); then
        start=1
    fi

    debug_print "get all topics $group_id [$start-$end]"

    for (( i = start; i <= end || end == 0; i += topic_range )); do
        # Clamp the final page to END when a limit was given.
        (( page_end = (end && i + topic_range - 1 > end) ? end : i + topic_range - 1 ))
        get_topics_single "$i" "$page_end" || break
    done
}
84 | |
# get_messages TOPIC_ID
# Print the message ids of a topic, one per line. Fetches the topic
# page of group_id and extracts, from every "subject" cell link, the
# path component that follows TOPIC_ID/.
get_messages() {
    local topic_id="$1"
    local topic_re

    # The id is spliced into a sed pattern, so escape the characters
    # that are special in a basic regular expression or would end the
    # s/// command. The expansion below is quoted for the same reason:
    # an id with whitespace must stay inside a single sed argument.
    topic_re=$(printf '%s' "$topic_id" | sed 's|[][\\/.^$*]|\\&|g') || return

    req_fragment "topic/${group_id}/${topic_id}" | sed -n \
        's/.*<td class="subject"><a href="[^"]*'"$topic_re"'\/\([^"]*\)".*/\1/p'
}
91 | |
# download_message TOPIC_ID MSG_ID
# Save the raw form of one message as
# "${dest_dir}/${group_id}${topic_id}${msg_id}.eml", unless a non-empty
# file of that name already exists. When ln_dir is set, the saved file
# is also hard-linked there. Progress is reported on stdout, failures
# on stderr.
download_message() {
    local topic_id="$1"
    local msg_id="$2"
    # Kept local so the globals of the same name are never overwritten.
    local path temp

    debug_print download topic "$topic_id" message "$msg_id"
    path="${dest_dir}/${group_id}${topic_id}${msg_id}.eml"

    if [[ -s "$path" ]]; then
        echo "message ${topic_id}${msg_id} already downloaded. skipping."
        return 0
    fi

    echo "message ${topic_id}${msg_id} downloading."
    if ! temp=$(mktemp); then
        echo "message ${topic_id}${msg_id} failed to download." >&2
        return 1
    fi

    # Download to a temp file first so an interrupted transfer never
    # leaves a partial .eml behind. -f makes curl fail on an HTTP error
    # status instead of saving the error page as if it were the message.
    if req "message/raw?msg=${group_id}/${topic_id}/${msg_id}" -f -o "$temp"; then
        if ! mv -- "$temp" "$path"; then
            rm -f -- "$temp"
            return 1
        fi
        if [[ -n "$ln_dir" ]]; then
            ln -- "$path" "$ln_dir"
        fi
    else
        echo "message ${topic_id}${msg_id} failed to download." >&2
        rm -f -- "$temp"
    fi
}
114 | |
# download_messages TOPIC_ID
# Download every message of a topic: list the message ids with
# get_messages and call download_message for each of them.
download_messages() {
    local topic_id="$1"
    local msg_id

    debug_print download topic "$topic_id"
    # -r keeps backslashes literal. Only the first word of each line is
    # the id; blank lines are skipped rather than requested with an
    # empty id.
    get_messages "$topic_id" | while read -r msg_id _; do
        [[ -n $msg_id ]] || continue
        download_message "$topic_id" "$msg_id"
    done
}
122 | |
# download_mails START END
# Download every message of every topic in the range START..END of the
# topic listing (see get_topics for how the bounds are interpreted).
download_mails() {
    local start=$1
    local end=$2
    local topic_id

    # The first whitespace-separated field of each listing line is the
    # topic id. The listing ends with a blank line when a page has no
    # topics; skip it instead of requesting a topic with an empty id.
    get_topics "$start" "$end" | while read -r topic_id _; do
        [[ -n $topic_id ]] || continue
        download_messages "$topic_id"
    done
}
131 | |
# show_version
# Print the program name and VERSION on stdout.
show_version() {
    # Quoted so the version string is printed verbatim, with no word
    # splitting or glob expansion.
    printf 'ggscrape %s\n' "$VERSION"
}
135 | |
# show_help
# Print the usage summary, the option list and the supported
# environment variable on stdout.
show_help() {
    # Quoted delimiter: the text is emitted literally, nothing in it is
    # subject to expansion.
    cat <<'EOF'
ggscrape. Download emails from a Google Group

Usage:
ggscrape <group_id> [test]
ggscrape <group_id> topics
ggscrape <group_id> messages <topic_id>
ggscrape <group_id> download <dest_dir>

Options:
-h, --help Show this screen
--version Show version
-v, --verbose Show debug info
-c, --cookie <cookie> Use the given cookie string
-b, --begin <topicnum> Topic number at which to begin downloading
-e, --end <topicnum> Topic number at which to stop downloading
-l, --ln <ln_dir> Hard link email files into this directory

Environmental variables:

GG_COOKIE use as value for --cookie

EOF
}
161 | |
# Results of command-line parsing.
cmd=
topic_id=
topic_begin=
topic_end=

# parse_args ARG...
# Interpret the command line and store the result in the globals cmd,
# topic_id, topic_begin, topic_end, opt_verbose, cookie_str, ln_dir,
# dest_dir and group_id. The first argument that is neither an option
# nor a command word becomes the group id.
# Exits with the usage-error status when an option or command lacks
# its value, on an unrecognised option, or on a second positional
# argument.
parse_args() {
    while (( $# > 0 )); do
        # Options and commands that consume the following argument must
        # actually have one.
        case "$1" in
            -c|--cookie|-b|--begin|-e|--end|-l|--ln|messages|download)
                if (( $# < 2 )); then
                    echo "Missing argument for $1" >&2
                    exit "${EX_USAGE:-64}"
                fi;;
        esac

        case "$1" in
            -h|--help) show_help; exit;;
            --version) show_version; exit;;
            -v|--verbose) opt_verbose=1;;
            -c|--cookie) cookie_str="$2"; shift;;
            -b|--begin) topic_begin="$2"; shift;;
            -e|--end) topic_end="$2"; shift;;
            -l|--ln) ln_dir="$2"; shift;;
            topics) cmd=topics;;
            test) cmd=test;;
            messages) cmd=messages; topic_id="$2"; shift;;
            download) cmd=download; dest_dir="$2"; shift;;
            # Reject unknown options instead of taking them as the
            # group id.
            -*) unknown_option "$1";;
            *) if [[ -z "$group_id" ]]; then
                   group_id="$1"
               else
                   unknown_option "$1"
               fi;;
        esac
        shift
    done
}

parse_args "$@"
188 | |
# A group id is mandatory: without one, print the help text and fail.
if [[ -z "$group_id" ]]; then
    show_help
    exit 1
fi

# Run the selected command. The script's exit status is that of the
# command that ran.
case "$cmd" in
    '') show_help; exit;;
    test) test_permission;;
    topics) get_topics "$topic_begin" "$topic_end";;
    download) download_mails "$topic_begin" "$topic_end";;
    messages) get_messages "$topic_id";;
    # An unrecognised command is a usage error, so report failure
    # rather than falling through with status 0.
    *) echo "Unknown command $cmd" >&2
       exit "${EX_USAGE:-64}";;
esac
202 |
Built with git-ssb-web