git ssb

0+

cel / ggscrape



Tree: 85166133f139a08a92d9d4a72ec96b266d9efbd1

Files: 85166133f139a08a92d9d4a72ec96b266d9efbd1 / ggscrape

4929 bytes · Raw
#!/bin/bash
# vi: ts=4 sw=4 et
#
# ggscrape(1)
# Download emails from a Google Group
#
# Copyright (c) 2014 Charles Lehner
# Released under the terms of the MIT License.

# Version string printed by --version.
readonly VERSION=1.0.0
# Prefix prepended to the path of every HTTP request.
readonly BASE_URL='https://groups.google.com/forum/'

# Exit status used for command-line usage errors.
readonly EX_USAGE=64
# Number of topics requested per listing page.
topic_range=100

# Settings filled in by the command-line parser.
opt_verbose=    # non-empty enables debug output on stderr
group_id=       # group to operate on
dest_dir=       # directory that receives downloaded .eml files
ln_dir=         # optional directory to hard link downloaded files into

# Cookie string passed to curl; GG_COOKIE supplies the default and
# --cookie may override it later.
cookie_str=${GG_COOKIE:-}
25
#######################################
# Print a debug message to stderr when verbose mode is enabled.
# Globals:   opt_verbose (read) - any non-empty value enables output
# Arguments: the words of the message; they are joined with spaces
# Outputs:   the message on stderr, nothing on stdout
#######################################
debug_print() {
    if [[ -n $opt_verbose ]]; then
        # "$*" keeps the message verbatim: no word splitting, no globbing.
        printf '%s\n' "$*" >&2
    fi
}
31
#######################################
# Perform an HTTP request relative to BASE_URL with curl.
# Globals:   BASE_URL (read), cookie_str (read)
# Arguments: $1   - path (and query string) appended to BASE_URL
#            $2.. - extra arguments handed to curl unchanged (e.g. -I, -o FILE)
# Outputs:   whatever curl writes to stdout
# Returns:   curl's exit status
#######################################
req() {
    local path=${1-}
    if (( $# )); then
        shift
    fi
    debug_print req: "$path"
    # The URL is one word; the remaining arguments stay separate words.
    curl -sN -b "$cookie_str" "${BASE_URL}${path}" "$@"
}
36
#######################################
# Request a page through the "_escaped_fragment_" query parameter.
# Arguments: $1   - fragment value placed after "?_escaped_fragment_="
#            $2.. - extra arguments forwarded to req (and from there to curl)
# Outputs:   the output of req
# Returns:   the status of req
#######################################
req_fragment() {
    local fragment=${1-}
    if (( $# )); then
        shift
    fi
    req "?_escaped_fragment_=${fragment}" "$@"
}
40
#######################################
# Check that the group's topic listing can be fetched.
# Sends a header-only request (-I) for the first topic of the group and
# looks for a 200 status line in the response headers.
# Globals:   group_id (read)
# Outputs:   an error message on stderr when access fails
# Returns:   0 when a 200 status line is seen, 1 otherwise
#######################################
check_permission() {
    debug_print testing permission
    # Accept both "HTTP/1.1 200 OK" and the reason-less "HTTP/2 200", and
    # only on a status line, so "200 OK" inside another header is ignored.
    if ! req_fragment "forum/${group_id}%5B1-1-false%5D" -I |
        grep -Eq '^HTTP/[0-9.]+ 200( OK)?[[:space:]]*$'
    then
        echo 'Unable to access group.' >&2
        return 1
    fi
}
49
#######################################
# List one page of topics from the group.
# Globals:   group_id (read)
# Arguments: $1 - number of the first topic to request
#            $2 - number of the last topic to request
# Outputs:   one line per topic: three tab-separated fields. Callers read
#            the first field as the topic id; the other two appear to be
#            the last-post date and the title (depends on the page markup).
# Returns:   0 when topics were printed; 99 when the page holds no topics;
#            the request's status when the request itself fails
#######################################
get_topics_single() {
    local start=$1
    local end=$2
    local page rows

    debug_print "get topics $group_id [$start-$end]"

    # Stop on a failed request; otherwise an open-ended listing would keep
    # asking for further pages forever.
    if ! page=$(req_fragment "forum/${group_id}%5B${start}-${end}-false%5D"); then
        echo "Request for topics [$start-$end] failed." >&2
        return 1
    fi

    # First pass (GNU sed): emit the text captured from "subject" links and
    # "lastPostDate" cells, and an empty line for the "of 0 topics" notice.
    # The trailing "." protects trailing newlines from command substitution.
    rows=$(printf '%s\n' "$page" | sed -n \
        "s/^<i>Showing [^<]* of 0 topics<\/i>$//p;
        s/<tr>/\0\n/; /lastPostDate/ {
            s/.*lastPostDate\">\([^<]*\).*$/\1/m; P; D;
        };
        /subject/{
            s/.*href=\"[^\"]*\/\([^\"]*\)\"[^>]*>\([^<]*\).*/\1\n\2/; p;
        }"; printf .)
    rows=${rows%.}

    # Nothing but blank lines means there are no topics on this page.
    [[ -n ${rows//$'\n'/} ]] || return 99

    # Second pass: join each group of three lines as first, third, second.
    printf '%s' "$rows" |
        sed '/^$/q99; N; N; s/\(.*\)\n\(.*\)\n\(.*\)/\1\t\3\t\2/'
    # return status 99 if no matches found
}
66
#######################################
# Report an unrecognised command-line argument and terminate the script.
# Globals:   EX_USAGE (read) - exit status; 64 is used if it is unset
# Arguments: $1 - the offending argument
# Outputs:   an error message on stderr
#######################################
unknown_option() {
    printf 'Unknown option %s\n' "$1" >&2
    # A bare `exit` with an empty status would report success.
    exit "${EX_USAGE:-64}"
}
71
#######################################
# List topics of the group, fetching topic_range topics per request.
# Globals:   topic_range (read), group_id (read)
# Arguments: $1 - first topic number; empty or 0 means 1
#            $2 - last topic number; empty or 0 means "until a page fails"
# Outputs:   the lines printed by get_topics_single for every page
# Returns:   1 when the permission check fails, otherwise 0
#######################################
get_topics() {
    local start=$(($1))
    local end=$(($2))
    local first last

    if ((start == 0)); then
        start=1
    fi

    check_permission || return 1

    debug_print "get all topics $group_id [$start-$end]"

    # With end == 0 the loop has no upper bound and relies on
    # get_topics_single returning non-zero once the topics run out.
    for ((first = start; first <= end || end == 0; first += topic_range)); do
        last=$((first + topic_range - 1))
        if ((end != 0 && last > end)); then
            last=$end
        fi
        get_topics_single "$first" "$last" || break
    done
}
90
#######################################
# Get the message ids in a topic.
# Globals:   group_id (read)
# Arguments: $1 - topic id
# Outputs:   one message id per line, taken from the part of each
#            "subject" link that follows "<topic id>/"
# Returns:   the exit status of the final sed
#######################################
get_messages() {
    local topic_id="$1"
    local topic_re

    # Escape regex metacharacters and "/" so the id is matched literally
    # and cannot break out of the sed expression below.
    topic_re=$(printf '%s' "$topic_id" | sed 's|[][\.*^$/]|\\&|g')

    req_fragment "topic/${group_id}/${topic_id}" | sed -n \
        's/.*<td class="subject"><a href="[^"]*'"$topic_re"'\/\([^"]*\)".*/\1/p'
}
97
#######################################
# Download one raw message into dest_dir unless it is already there.
# The file is named "<group_id><topic_id><msg_id>.eml". A non-empty
# existing file counts as already downloaded.
# Globals:   dest_dir (read), group_id (read), ln_dir (read)
# Arguments: $1 - topic id
#            $2 - message id
# Outputs:   progress messages on stdout, failures on stderr
#######################################
download_message() {
    local topic_id="$1"
    local msg_id="$2"
    local path temp

    debug_print download topic "$topic_id" message "$msg_id"
    path="${dest_dir}/${group_id}${topic_id}${msg_id}.eml"
    if [[ -s "$path" ]]; then
        echo "message ${topic_id}${msg_id} already downloaded. skipping."
        return
    fi

    echo "message ${topic_id}${msg_id} downloading."
    if ! temp=$(mktemp); then
        echo "message ${topic_id}${msg_id} failed to download." >&2
        return 1
    fi

    # Download to a temporary file first so an interrupted transfer never
    # leaves a partial .eml behind. -f makes curl fail on HTTP errors
    # instead of saving the error page as if it were the message.
    if req "message/raw?msg=${group_id}/${topic_id}/${msg_id}" -f -o "$temp" &&
        mv -- "$temp" "$path"
    then
        if [[ -n "$ln_dir" ]]; then
            ln -- "$path" "$ln_dir"
        fi
    else
        echo "message ${topic_id}${msg_id} failed to download." >&2
        rm -f -- "$temp"
    fi
}
120
#######################################
# Download every message of one topic.
# Arguments: $1 - topic id
# Outputs:   the output of download_message for each message
#######################################
download_messages() {
    local topic_id="$1"
    local msg_id

    debug_print download topic "$topic_id"
    # Only the first field of each line is the message id; -r keeps any
    # backslash in it intact.
    get_messages "$topic_id" | while read -r msg_id _; do
        download_message "$topic_id" "$msg_id"
    done
}
128
#######################################
# Download the messages of every topic in a range.
# Arguments: $1 - first topic number (passed on to get_topics)
#            $2 - last topic number (passed on to get_topics)
# Outputs:   the output of download_messages for each topic
#######################################
download_mails() {
    local start=$1
    local end=$2
    local topic_id

    # The first field of each line is the topic id; the rest is unused here.
    get_topics "$start" "$end" | while read -r topic_id _; do
        # A blank line carries no topic; skip it rather than request
        # an empty topic id.
        [[ -n $topic_id ]] || continue
        download_messages "$topic_id"
    done
}
137
#######################################
# Print the program name and version.
# Globals:   VERSION (read)
# Outputs:   "ggscrape <version>" on stdout
#######################################
show_version() {
    printf 'ggscrape %s\n' "$VERSION"
}
141
#######################################
# Print the usage screen.
# Outputs:   the help text on stdout
#######################################
show_help() {
    # Quoted delimiter: the text is printed literally, with no expansion.
    cat << 'EOF'
ggscrape. Download emails from a Google Group

Usage:
    ggscrape <group_id> test
    ggscrape <group_id> topics
    ggscrape <group_id> messages <topic_id>
    ggscrape <group_id> download <dest_dir>

Options:
    -h, --help              Show this screen
    --version               Show version
    -v, --verbose           Show debug info
    -c, --cookie <cookie>   Use the given cookie string
    -b, --begin <topicnum>  Topic number at which to begin downloading
    -e, --end <topicnum>    Topic number at which to stop downloading
    -l, --ln <ln_dir>       Hard link email files into this directory

Environmental variables:

    GG_COOKIE               use as value for --cookie

EOF
}
167
# Results of command-line parsing.
cmd=            # selected command: test, topics, messages or download
topic_id=       # topic id given to the "messages" command
topic_begin=    # value of --begin
topic_end=      # value of --end

#######################################
# Abort with a usage error unless an option or command has a value after it.
# Arguments: $1 - the option or command name
#            $2 - number of arguments left, counting the name itself
#######################################
require_value() {
    if (( $2 < 2 )); then
        printf 'Missing argument for %s\n' "$1" >&2
        exit "${EX_USAGE:-64}"
    fi
}

#######################################
# Parse the command line into the global settings.
# Globals:   opt_verbose, cookie_str, topic_begin, topic_end, ln_dir, cmd,
#            topic_id, dest_dir, group_id (all written)
# Arguments: the script's command-line arguments
# Outputs:   help or version text for -h/--version (then exits);
#            usage errors on stderr (then exits with a usage status)
#######################################
parse_args() {
    while (( $# > 0 )); do
        case "$1" in
            -h|--help) show_help; exit;;
            --version) show_version; exit;;
            -v|--verbose) opt_verbose=1;;
            -c|--cookie) require_value "$1" "$#"; cookie_str="$2"; shift;;
            -b|--begin) require_value "$1" "$#"; topic_begin="$2"; shift;;
            -e|--end) require_value "$1" "$#"; topic_end="$2"; shift;;
            -l|--ln) require_value "$1" "$#"; ln_dir="$2"; shift;;
            topics) cmd=topics;;
            test) cmd=test;;
            messages) require_value "$1" "$#"; cmd=messages; topic_id="$2"; shift;;
            download) require_value "$1" "$#"; cmd=download; dest_dir="$2"; shift;;
            # An unrecognised option must not be mistaken for the group id.
            -*) unknown_option "$1";;
            *) if [[ -z "$group_id" ]]; then
                    group_id="$1"
                else
                    unknown_option "$1"
                fi;;
        esac
        shift
    done
}

parse_args "$@"
194
# A group id is mandatory for every command.
if [[ -z "$group_id" ]]; then
    show_help
    exit 1
fi

# Run the selected command; its status becomes the script's exit status.
case "$cmd" in
    '') show_help; exit;;
    test) check_permission && echo Success;;
    topics) get_topics "$topic_begin" "$topic_end";;
    download)
        # An empty destination would make the files land in "/".
        if [[ -z "$dest_dir" ]]; then
            echo 'Missing destination directory for download' >&2
            exit "${EX_USAGE:-64}"
        fi
        download_mails "$topic_begin" "$topic_end";;
    messages)
        if [[ -z "$topic_id" ]]; then
            echo 'Missing topic id for messages' >&2
            exit "${EX_USAGE:-64}"
        fi
        get_messages "$topic_id";;
    *) echo "Unknown command $cmd" >&2; exit "${EX_USAGE:-64}";;
esac
208

Built with git-ssb-web