Message ID | 20170224042835.12728-1-dja@axtens.net |
---|---|
State | Rejected |
Headers | show |
On Fri, 2017-02-24 at 15:28 +1100, Daniel Axtens wrote: > This brings it into line with the other import commands. > > It's also an order of magnitude or so faster. > > Signed-off-by: Daniel Axtens <dja@axtens.net> I never got the difference between this and 'parsearchive'. Care to enlighten me? :) Stephen > > --- > > This isn't that useful per se, but I'm using it to benchmark > the performance impact of changes look for opportunities to > speed up. > --- > patchwork/bin/parsemail-batch.sh | 25 +++----- > patchwork/management/commands/parsemail-batch.py | 77 > ++++++++++++++++++++++++ > 2 files changed, 85 insertions(+), 17 deletions(-) > create mode 100644 patchwork/management/commands/parsemail-batch.py > > diff --git a/patchwork/bin/parsemail-batch.sh > b/patchwork/bin/parsemail-batch.sh > index d42712ed0995..f35808e9d11b 100755 > --- a/patchwork/bin/parsemail-batch.sh > +++ b/patchwork/bin/parsemail-batch.sh > @@ -20,25 +20,16 @@ > # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111- > 1307 USA > > BIN_DIR=$(dirname "$0") > +PATCHWORK_BASE=$(readlink -e "$BIN_DIR/../..") > > -if [ $# -lt 1 ]; then > - echo "usage: $0 <dir> [options]" >&2 > - exit 1 > +if [ -z "$PW_PYTHON" ]; then > + PW_PYTHON=python2 > fi > > -mail_dir="$1" > - > -echo "dir: $mail_dir" > - > -if [ ! -d "$mail_dir" ]; then > - echo "$mail_dir should be a directory"? >&2 > - exit 1 > +if [ -z "$DJANGO_SETTINGS_MODULE" ]; then > + DJANGO_SETTINGS_MODULE=patchwork.settings.production > fi > > -shift > - > -find "$mail_dir" -maxdepth 1 | > -while read -r line; do > - echo "$line" > - "$BIN_DIR/parsemail.sh" "$@" < "$line" > -done > +PYTHONPATH="${PATCHWORK_BASE}:${PATCHWORK_BASE}/lib/python:$PYTHONPA > TH" \ > + DJANGO_SETTINGS_MODULE="$DJANGO_SETTINGS_MODULE" \ > + "$PW_PYTHON" "$PATCHWORK_BASE/manage.py" parsemail-batch "$@" > diff --git a/patchwork/management/commands/parsemail-batch.py > b/patchwork/management/commands/parsemail-batch.py > new file mode 100644 > index 000000000000..ad4dce35f595 > --- /dev/null > +++ b/patchwork/management/commands/parsemail-batch.py > @@ -0,0 +1,77 @@ > +# Patchwork - automated patch tracking system > +# Copyright (C) 2017, Daniel Axtens, IBM Corporation. > +# > +# This file is part of the Patchwork package. > +# > +# Patchwork is free software; you can redistribute it and/or modify > +# it under the terms of the GNU General Public License as published > by > +# the Free Software Foundation; either version 2 of the License, or > +# (at your option) any later version. > +# > +# Patchwork is distributed in the hope that it will be useful, > +# but WITHOUT ANY WARRANTY; without even the implied warranty of > +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the > +# GNU General Public License for more details. > + > +import email > +import logging > +from optparse import make_option > +import sys > +import os > + > +import django > +from django.core.management import base > +from django.utils import six > + > +from patchwork.parser import parse_mail > + > +logger = logging.getLogger(__name__) > + > + > +class Command(base.BaseCommand): > + help = 'Parse a directory of mbox files and store any > patches/comments found.' > + > + if django.VERSION < (1, 8): > + args = '<indir>' > + option_list = base.BaseCommand.option_list + ( > + make_option( > + '--list-id', > + help='mailing list ID. If not supplied, this will be > ' > + 'extracted from the mail headers.'), > + ) > + else: > + def add_arguments(self, parser): > + parser.add_argument( > + 'indir', > + type=str, > + default=None, > + help='input mbox directory') > + parser.add_argument( > + '--list-id', > + help='mailing list ID. If not supplied, this will be > ' > + 'extracted from the mail headers.') > + > + def handle(self, *args, **options): > + indir = args[0] if args else options['indir'] > + > + if not os.path.isdir(indir): > + logger.error('%s is not a directory' % indir) > + sys.exit(1) > + > + for infile in sorted(os.listdir(indir)): > + print(infile) > + if six.PY3: > + with open(os.path.join(indir, infile), 'rb') as > file_: > + mail = email.message_from_binary_file(file_) > + else: > + with open(os.path.join(indir, infile)) as file_: > + mail = email.message_from_file(file_) > + > + try: > + result = parse_mail(mail, options['list_id']) > + if not result: > + logger.warning('Failed to parse mail') > + except Exception as e: > + logger.warning('Error when parsing incoming email ' > + \ > + str(e), > + extra={'mail': mail.as_string()})
Stephen Finucane <stephen@that.guru> writes: > On Fri, 2017-02-24 at 15:28 +1100, Daniel Axtens wrote: >> This brings it into line with the other import commands. >> >> It's also an order of magnitude or so faster. >> >> Signed-off-by: Daniel Axtens <dja@axtens.net> > > I never got the difference between this and 'parsearchive'. Care to > enlighten me? :) parsemail-batch takes a directory, parsearchive takes an mbox file. An mbox file is basically a concatenation of messages, so you can convert between the two, but while there are tools to go from mbox to maildir, I haven't found a tool to go the other way. You have to add a Unix From line and getting the delimiters right is a bit tricky/annoying. As a notmuch user, my mail is automatically in a directory, and a combination of notmuch search and xargs allows me to create an arbitrary directory of mail matching a set of criteria, for example: notmuch search --output=files tag:lkml | grep -v 'sent/cur' | xargs cp -t lkml gets me mail I received from LKML. So I use parsemail-batch *a lot*. Also, with this, we could in future reduce all the shell commands to 1 single little wrappers that checks what it's name was and calls the right management command - like 'python manage.py $(basename $0 .sh)'. We can then symlink it in to place under different names. Given how easy shell scripts are to screw up, this would reduce our scope for error. Texinfo does something like this too, which is where I got the idea. Regards, Daniel > > Stephen > >> >> --- >> >> This isn't that useful per se, but I'm using it to benchmark >> the performance impact of changes look for opportunities to >> speed up. >> --- >> patchwork/bin/parsemail-batch.sh | 25 +++----- >> patchwork/management/commands/parsemail-batch.py | 77 >> ++++++++++++++++++++++++ >> 2 files changed, 85 insertions(+), 17 deletions(-) >> create mode 100644 patchwork/management/commands/parsemail-batch.py >> >> diff --git a/patchwork/bin/parsemail-batch.sh >> b/patchwork/bin/parsemail-batch.sh >> index d42712ed0995..f35808e9d11b 100755 >> --- a/patchwork/bin/parsemail-batch.sh >> +++ b/patchwork/bin/parsemail-batch.sh >> @@ -20,25 +20,16 @@ >> # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111- >> 1307 USA >> >> BIN_DIR=$(dirname "$0") >> +PATCHWORK_BASE=$(readlink -e "$BIN_DIR/../..") >> >> -if [ $# -lt 1 ]; then >> - echo "usage: $0 <dir> [options]" >&2 >> - exit 1 >> +if [ -z "$PW_PYTHON" ]; then >> + PW_PYTHON=python2 >> fi >> >> -mail_dir="$1" >> - >> -echo "dir: $mail_dir" >> - >> -if [ ! -d "$mail_dir" ]; then >> - echo "$mail_dir should be a directory"? >&2 >> - exit 1 >> +if [ -z "$DJANGO_SETTINGS_MODULE" ]; then >> + DJANGO_SETTINGS_MODULE=patchwork.settings.production >> fi >> >> -shift >> - >> -find "$mail_dir" -maxdepth 1 | >> -while read -r line; do >> - echo "$line" >> - "$BIN_DIR/parsemail.sh" "$@" < "$line" >> -done >> +PYTHONPATH="${PATCHWORK_BASE}:${PATCHWORK_BASE}/lib/python:$PYTHONPA >> TH" \ >> + DJANGO_SETTINGS_MODULE="$DJANGO_SETTINGS_MODULE" \ >> + "$PW_PYTHON" "$PATCHWORK_BASE/manage.py" parsemail-batch "$@" >> diff --git a/patchwork/management/commands/parsemail-batch.py >> b/patchwork/management/commands/parsemail-batch.py >> new file mode 100644 >> index 000000000000..ad4dce35f595 >> --- /dev/null >> +++ b/patchwork/management/commands/parsemail-batch.py >> @@ -0,0 +1,77 @@ >> +# Patchwork - automated patch tracking system >> +# Copyright (C) 2017, Daniel Axtens, IBM Corporation. >> +# >> +# This file is part of the Patchwork package. >> +# >> +# Patchwork is free software; you can redistribute it and/or modify >> +# it under the terms of the GNU General Public License as published >> by >> +# the Free Software Foundation; either version 2 of the License, or >> +# (at your option) any later version. >> +# >> +# Patchwork is distributed in the hope that it will be useful, >> +# but WITHOUT ANY WARRANTY; without even the implied warranty of >> +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the >> +# GNU General Public License for more details. >> + >> +import email >> +import logging >> +from optparse import make_option >> +import sys >> +import os >> + >> +import django >> +from django.core.management import base >> +from django.utils import six >> + >> +from patchwork.parser import parse_mail >> + >> +logger = logging.getLogger(__name__) >> + >> + >> +class Command(base.BaseCommand): >> + help = 'Parse a directory of mbox files and store any >> patches/comments found.' >> + >> + if django.VERSION < (1, 8): >> + args = '<indir>' >> + option_list = base.BaseCommand.option_list + ( >> + make_option( >> + '--list-id', >> + help='mailing list ID. If not supplied, this will be >> ' >> + 'extracted from the mail headers.'), >> + ) >> + else: >> + def add_arguments(self, parser): >> + parser.add_argument( >> + 'indir', >> + type=str, >> + default=None, >> + help='input mbox directory') >> + parser.add_argument( >> + '--list-id', >> + help='mailing list ID. If not supplied, this will be >> ' >> + 'extracted from the mail headers.') >> + >> + def handle(self, *args, **options): >> + indir = args[0] if args else options['indir'] >> + >> + if not os.path.isdir(indir): >> + logger.error('%s is not a directory' % indir) >> + sys.exit(1) >> + >> + for infile in sorted(os.listdir(indir)): >> + print(infile) >> + if six.PY3: >> + with open(os.path.join(indir, infile), 'rb') as >> file_: >> + mail = email.message_from_binary_file(file_) >> + else: >> + with open(os.path.join(indir, infile)) as file_: >> + mail = email.message_from_file(file_) >> + >> + try: >> + result = parse_mail(mail, options['list_id']) >> + if not result: >> + logger.warning('Failed to parse mail') >> + except Exception as e: >> + logger.warning('Error when parsing incoming email ' >> + \ >> + str(e), >> + extra={'mail': mail.as_string()})
On Sun, 2017-02-26 at 10:56 +1100, Daniel Axtens wrote: > Stephen Finucane <stephen@that.guru> writes: > > > On Fri, 2017-02-24 at 15:28 +1100, Daniel Axtens wrote: > > > This brings it into line with the other import commands. > > > > > > It's also an order of magnitude or so faster. > > > > > > Signed-off-by: Daniel Axtens <dja@axtens.net> > > > > I never got the difference between this and 'parsearchive'. Care to > > enlighten me? :) > > parsemail-batch takes a directory, parsearchive takes an mbox file. Out of curiosity, are you referring to a "maildir"? If so, see below. > An mbox file is basically a concatenation of messages, so you can > convert between the two, but while there are tools to go from mbox to > maildir, I haven't found a tool to go the other way. You have to add > a > Unix From line and getting the delimiters right is a bit > tricky/annoying. I don't know about converting between the two, but it would seem the 'mailbox' library that we use for reading mbox files in 'parsearchive' [1] also supports parsing of Maildirs [2]. I wonder if we could pass a flag to this script instead or even detect if the target is a file (mbox) or directory (maildir)? [1] https://github.com/getpatchwork/patchwork/blob/07835705/patchwork/m anagement/commands/parsearchive.py#L72 [2] https://pymotw.com/2/mailbox/#maildir > As a notmuch user, my mail is automatically in a directory, and a > combination of notmuch search and xargs allows me to create an > arbitrary > directory of mail matching a set of criteria, for example: > > notmuch search --output=files tag:lkml | grep -v 'sent/cur' | xargs > cp -t lkml > > gets me mail I received from LKML. Very nice. I wasn't aware of this tool. That might be something worth documenting in the user manual. > So I use parsemail-batch *a lot*. > > Also, with this, we could in future reduce all the shell commands to > 1 > single little wrappers that checks what it's name was and calls the > right management command - like 'python manage.py $(basename $0 > .sh)'. We can then symlink it in to place under different names. For sure. I tried to unite the parsemail and parsearchive scripts when converting them to management commands but I couldn't find a way to read from stdin with the 'mailbox' library [3], unlike the 'email' library [4]. [3] https://github.com/getpatchwork/patchwork/blob/07835705/patchwork/m anagement/commands/parsearchive.py#L66 [4] https://github.com/getpatchwork/patchwork/blob/07835705/patchwork/m anagement/commands/parsemail.py#L61-L68 > Given how easy shell scripts are to screw up, this would reduce our > scope for error. Texinfo does something like this too, which is where > I > got the idea. Yup, I'm 100% behind doing this. I'm just making sure we're not introducing unnecessary duplication while doing so :) Stephen > Regards, > Daniel > > > > > Stephen > > > > > > > > --- > > > > > > This isn't that useful per se, but I'm using it to benchmark > > > the performance impact of changes look for opportunities to > > > speed up. > > > --- > > > patchwork/bin/parsemail-batch.sh | 25 +++----- > > > patchwork/management/commands/parsemail-batch.py | 77 > > > ++++++++++++++++++++++++ > > > 2 files changed, 85 insertions(+), 17 deletions(-) > > > create mode 100644 patchwork/management/commands/parsemail- > > > batch.py > > > > > > diff --git a/patchwork/bin/parsemail-batch.sh > > > b/patchwork/bin/parsemail-batch.sh > > > index d42712ed0995..f35808e9d11b 100755 > > > --- a/patchwork/bin/parsemail-batch.sh > > > +++ b/patchwork/bin/parsemail-batch.sh > > > @@ -20,25 +20,16 @@ > > > # Foundation, Inc., 59 Temple Place, Suite 330, Boston, > > > MA 02111- > > > 1307 USA > > > > > > BIN_DIR=$(dirname "$0") > > > +PATCHWORK_BASE=$(readlink -e "$BIN_DIR/../..") > > > > > > -if [ $# -lt 1 ]; then > > > - echo "usage: $0 <dir> [options]" >&2 > > > - exit 1 > > > +if [ -z "$PW_PYTHON" ]; then > > > + PW_PYTHON=python2 > > > fi > > > > > > -mail_dir="$1" > > > - > > > -echo "dir: $mail_dir" > > > - > > > -if [ ! -d "$mail_dir" ]; then > > > - echo "$mail_dir should be a directory"? >&2 > > > - exit 1 > > > +if [ -z "$DJANGO_SETTINGS_MODULE" ]; then > > > + DJANGO_SETTINGS_MODULE=patchwork.settings.production > > > fi > > > > > > -shift > > > - > > > -find "$mail_dir" -maxdepth 1 | > > > -while read -r line; do > > > - echo "$line" > > > - "$BIN_DIR/parsemail.sh" "$@" < "$line" > > > -done > > > +PYTHONPATH="${PATCHWORK_BASE}:${PATCHWORK_BASE}/lib/python:$PYTH > > > ONPA > > > TH" \ > > > + DJANGO_SETTINGS_MODULE="$DJANGO_SETTINGS_MODULE" \ > > > + "$PW_PYTHON" "$PATCHWORK_BASE/manage.py" parsemail-batch > > > "$@" > > > diff --git a/patchwork/management/commands/parsemail-batch.py > > > b/patchwork/management/commands/parsemail-batch.py > > > new file mode 100644 > > > index 000000000000..ad4dce35f595 > > > --- /dev/null > > > +++ b/patchwork/management/commands/parsemail-batch.py > > > @@ -0,0 +1,77 @@ > > > +# Patchwork - automated patch tracking system > > > +# Copyright (C) 2017, Daniel Axtens, IBM Corporation. > > > +# > > > +# This file is part of the Patchwork package. > > > +# > > > +# Patchwork is free software; you can redistribute it and/or > > > modify > > > +# it under the terms of the GNU General Public License as > > > published > > > by > > > +# the Free Software Foundation; either version 2 of the License, > > > or > > > +# (at your option) any later version. > > > +# > > > +# Patchwork is distributed in the hope that it will be useful, > > > +# but WITHOUT ANY WARRANTY; without even the implied warranty of > > > +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the > > > +# GNU General Public License for more details. > > > + > > > +import email > > > +import logging > > > +from optparse import make_option > > > +import sys > > > +import os > > > + > > > +import django > > > +from django.core.management import base > > > +from django.utils import six > > > + > > > +from patchwork.parser import parse_mail > > > + > > > +logger = logging.getLogger(__name__) > > > + > > > + > > > +class Command(base.BaseCommand): > > > + help = 'Parse a directory of mbox files and store any > > > patches/comments found.' > > > + > > > + if django.VERSION < (1, 8): > > > + args = '<indir>' > > > + option_list = base.BaseCommand.option_list + ( > > > + make_option( > > > + '--list-id', > > > + help='mailing list ID. If not supplied, this > > > will be > > > ' > > > + 'extracted from the mail headers.'), > > > + ) > > > + else: > > > + def add_arguments(self, parser): > > > + parser.add_argument( > > > + 'indir', > > > + type=str, > > > + default=None, > > > + help='input mbox directory') > > > + parser.add_argument( > > > + '--list-id', > > > + help='mailing list ID. If not supplied, this > > > will be > > > ' > > > + 'extracted from the mail headers.') > > > + > > > + def handle(self, *args, **options): > > > + indir = args[0] if args else options['indir'] > > > + > > > + if not os.path.isdir(indir): > > > + logger.error('%s is not a directory' % indir) > > > + sys.exit(1) > > > + > > > + for infile in sorted(os.listdir(indir)): > > > + print(infile) > > > + if six.PY3: > > > + with open(os.path.join(indir, infile), 'rb') as > > > file_: > > > + mail = email.message_from_binary_file(file_) > > > + else: > > > + with open(os.path.join(indir, infile)) as file_: > > > + mail = email.message_from_file(file_) > > > + > > > + try: > > > + result = parse_mail(mail, options['list_id']) > > > + if not result: > > > + logger.warning('Failed to parse mail') > > > + except Exception as e: > > > + logger.warning('Error when parsing incoming > > > email ' > > > + \ > > > + str(e), > > > + extra={'mail': mail.as_string()})
diff --git a/patchwork/bin/parsemail-batch.sh b/patchwork/bin/parsemail-batch.sh index d42712ed0995..f35808e9d11b 100755 --- a/patchwork/bin/parsemail-batch.sh +++ b/patchwork/bin/parsemail-batch.sh @@ -20,25 +20,16 @@ # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA BIN_DIR=$(dirname "$0") +PATCHWORK_BASE=$(readlink -e "$BIN_DIR/../..") -if [ $# -lt 1 ]; then - echo "usage: $0 <dir> [options]" >&2 - exit 1 +if [ -z "$PW_PYTHON" ]; then + PW_PYTHON=python2 fi -mail_dir="$1" - -echo "dir: $mail_dir" - -if [ ! -d "$mail_dir" ]; then - echo "$mail_dir should be a directory"? >&2 - exit 1 +if [ -z "$DJANGO_SETTINGS_MODULE" ]; then + DJANGO_SETTINGS_MODULE=patchwork.settings.production fi -shift - -find "$mail_dir" -maxdepth 1 | -while read -r line; do - echo "$line" - "$BIN_DIR/parsemail.sh" "$@" < "$line" -done +PYTHONPATH="${PATCHWORK_BASE}:${PATCHWORK_BASE}/lib/python:$PYTHONPATH" \ + DJANGO_SETTINGS_MODULE="$DJANGO_SETTINGS_MODULE" \ + "$PW_PYTHON" "$PATCHWORK_BASE/manage.py" parsemail-batch "$@" diff --git a/patchwork/management/commands/parsemail-batch.py b/patchwork/management/commands/parsemail-batch.py new file mode 100644 index 000000000000..ad4dce35f595 --- /dev/null +++ b/patchwork/management/commands/parsemail-batch.py @@ -0,0 +1,77 @@ +# Patchwork - automated patch tracking system +# Copyright (C) 2017, Daniel Axtens, IBM Corporation. +# +# This file is part of the Patchwork package. +# +# Patchwork is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. +# +# Patchwork is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. + +import email +import logging +from optparse import make_option +import sys +import os + +import django +from django.core.management import base +from django.utils import six + +from patchwork.parser import parse_mail + +logger = logging.getLogger(__name__) + + +class Command(base.BaseCommand): + help = 'Parse a directory of mbox files and store any patches/comments found.' + + if django.VERSION < (1, 8): + args = '<indir>' + option_list = base.BaseCommand.option_list + ( + make_option( + '--list-id', + help='mailing list ID. If not supplied, this will be ' + 'extracted from the mail headers.'), + ) + else: + def add_arguments(self, parser): + parser.add_argument( + 'indir', + type=str, + default=None, + help='input mbox directory') + parser.add_argument( + '--list-id', + help='mailing list ID. If not supplied, this will be ' + 'extracted from the mail headers.') + + def handle(self, *args, **options): + indir = args[0] if args else options['indir'] + + if not os.path.isdir(indir): + logger.error('%s is not a directory' % indir) + sys.exit(1) + + for infile in sorted(os.listdir(indir)): + print(infile) + if six.PY3: + with open(os.path.join(indir, infile), 'rb') as file_: + mail = email.message_from_binary_file(file_) + else: + with open(os.path.join(indir, infile)) as file_: + mail = email.message_from_file(file_) + + try: + result = parse_mail(mail, options['list_id']) + if not result: + logger.warning('Failed to parse mail') + except Exception as e: + logger.warning('Error when parsing incoming email ' + \ + str(e), + extra={'mail': mail.as_string()})
This brings it into line with the other import commands. It's also an order of magnitude or so faster. Signed-off-by: Daniel Axtens <dja@axtens.net> --- This isn't that useful per se, but I'm using it to benchmark the performance impact of changes look for opportunities to speed up. --- patchwork/bin/parsemail-batch.sh | 25 +++----- patchwork/management/commands/parsemail-batch.py | 77 ++++++++++++++++++++++++ 2 files changed, 85 insertions(+), 17 deletions(-) create mode 100644 patchwork/management/commands/parsemail-batch.py