File: //kunden/lib/python3/dist-packages/fastimport/parser.py
# Copyright (C) 2008-2010 Canonical Ltd
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
"""Parser of import data into command objects.
In order to reuse existing front-ends, the stream format is a subset of
the one used by git-fast-import (as of the 1.5.4 release of git at least).
The grammar is:
stream ::= cmd*;
cmd ::= new_blob
| new_commit
| new_tag
| reset_branch
| checkpoint
| progress
;
new_blob ::= 'blob' lf
mark?
file_content;
file_content ::= data;
new_commit ::= 'commit' sp ref_str lf
mark?
('author' sp name '<' email '>' when lf)?
'committer' sp name '<' email '>' when lf
commit_msg
('from' sp (ref_str | hexsha1 | sha1exp_str | idnum) lf)?
('merge' sp (ref_str | hexsha1 | sha1exp_str | idnum) lf)*
file_change*
lf?;
commit_msg ::= data;
file_change ::= file_clr
| file_del
| file_rnm
| file_cpy
| file_obm
| file_inm;
file_clr ::= 'deleteall' lf;
file_del ::= 'D' sp path_str lf;
file_rnm ::= 'R' sp path_str sp path_str lf;
file_cpy ::= 'C' sp path_str sp path_str lf;
file_obm ::= 'M' sp mode sp (hexsha1 | idnum) sp path_str lf;
file_inm ::= 'M' sp mode sp 'inline' sp path_str lf
data;
new_tag ::= 'tag' sp tag_str lf
'from' sp (ref_str | hexsha1 | sha1exp_str | idnum) lf
'tagger' sp name '<' email '>' when lf
tag_msg;
tag_msg ::= data;
reset_branch ::= 'reset' sp ref_str lf
('from' sp (ref_str | hexsha1 | sha1exp_str | idnum) lf)?
lf?;
checkpoint ::= 'checkpoint' lf
lf?;
progress ::= 'progress' sp not_lf* lf
lf?;
# note: the first idnum in a stream should be 1 and subsequent
# idnums should not have gaps between values as this will cause
# the stream parser to reserve space for the gapped values. An
# idnum can be updated in the future to a new object by issuing
# a new mark directive with the old idnum.
#
mark ::= 'mark' sp idnum lf;
data ::= (delimited_data | exact_data)
lf?;
# note: delim may be any string but must not contain lf.
# data_line may contain any data but must not be exactly
# delim. The lf after the final data_line is included in
# the data.
delimited_data ::= 'data' sp '<<' delim lf
(data_line lf)*
delim lf;
# note: declen indicates the length of binary_data in bytes.
# declen does not include the lf preceding the binary data.
#
exact_data ::= 'data' sp declen lf
binary_data;
# note: quoted strings are C-style quoting supporting \c for
# common escapes of 'c' (e.g. \n, \t, \\, \") or \nnn where nnn
# is the signed byte value in octal. Note that the only
# characters which must actually be escaped to protect the
# stream formatting are: \, " and LF. Otherwise these values
# are UTF8.
#
ref_str ::= ref;
sha1exp_str ::= sha1exp;
tag_str ::= tag;
path_str ::= path | '"' quoted(path) '"' ;
mode ::= '100644' | '644'
| '100755' | '755'
| '120000'
;
declen ::= # unsigned 32 bit value, ascii base10 notation;
bigint ::= # unsigned integer value, ascii base10 notation;
binary_data ::= # file content, not interpreted;
when ::= raw_when | rfc2822_when;
raw_when ::= ts sp tz;
rfc2822_when ::= # Valid RFC 2822 date and time;
sp ::= # ASCII space character;
lf ::= # ASCII newline (LF) character;
# note: a colon (':') must precede the numerical value assigned to
# an idnum. This is to distinguish it from a ref or tag name as
# GIT does not permit ':' in ref or tag strings.
#
idnum ::= ':' bigint;
path ::= # GIT style file path, e.g. "a/b/c";
ref ::= # GIT ref name, e.g. "refs/heads/MOZ_GECKO_EXPERIMENT";
tag ::= # GIT tag name, e.g. "FIREFOX_1_5";
sha1exp ::= # Any valid GIT SHA1 expression;
hexsha1 ::= # SHA1 in hexadecimal format;
# note: name and email are UTF8 strings, however name must not
# contain '<' or lf and email must not contain any of the
# following: '<', '>', lf.
#
name ::= # valid GIT author/committer name;
email ::= # valid GIT author/committer email;
ts ::= # time since the epoch in seconds, ascii base10 notation;
tz ::= # GIT style timezone;
# note: comments may appear anywhere in the input, except
# within a data command. Any form of the data command
# always escapes the related input from comment processing.
#
# In case it is not clear, the '#' that starts the comment
# must be the first character on the line (an lf must have
# preceded it).
#
comment ::= '#' not_lf* lf;
not_lf ::= # Any byte that is not ASCII newline (LF);
"""
from __future__ import print_function
import collections
import re
import sys
import codecs
from fastimport import (
commands,
dates,
errors,
)
from fastimport.helpers import (
newobject as object,
utf8_bytes_string,
)
## Stream parsing ##
class LineBasedParser(object):
    """A parser that reads a stream line by line, tracking line numbers.

    Lines may be pushed back onto the stream, and raw bytes may be read
    directly from the underlying stream (bypassing the pushback buffer).
    """

    def __init__(self, input_stream):
        """A Parser that keeps track of line numbers.

        :param input_stream: the file-like object to read from
        """
        self.input = input_stream
        self.lineno = 0
        # Lines pushed back onto the input stream
        self._buffer = []

    def abort(self, exception, *args):
        """Raise an exception providing line number information."""
        raise exception(self.lineno, *args)

    def readline(self):
        """Get the next line including the newline or '' on EOF."""
        self.lineno += 1
        if self._buffer:
            return self._buffer.pop()
        else:
            return self.input.readline()

    def next_line(self):
        """Get the next line without the newline or None on EOF."""
        line = self.readline()
        if line:
            return line[:-1]
        else:
            return None

    def push_line(self, line):
        """Push line back onto the line buffer.

        :param line: the line with no trailing newline
        """
        self.lineno -= 1
        self._buffer.append(line + b'\n')

    def read_bytes(self, count):
        """Read a given number of bytes from the input stream.

        Throws MissingBytes if the bytes are not found.

        Note: This method does not read from the line buffer.

        :return: a string
        """
        result = self.input.read(count)
        found = len(result)
        # Keep the line counter in step with any newlines inside the blob
        self.lineno += result.count(b'\n')
        if found != count:
            self.abort(errors.MissingBytes, count, found)
        return result

    def read_until(self, terminator):
        """Read the input stream until the terminator is found.

        Throws MissingTerminator if the terminator is not found.

        Note: This method does not read from the line buffer.

        :return: the bytes read up to but excluding the terminator.
        """
        lines = []
        term = terminator + b'\n'
        while True:
            line = self.input.readline()
            if line == term:
                break
            elif line == b'':
                # EOF before the terminator was seen.  Previously this
                # looped forever accumulating empty reads; report the
                # missing terminator as the docstring promises.
                self.abort(errors.MissingTerminator, terminator)
            else:
                lines.append(line)
        return b''.join(lines)
# Regular expression used for parsing. (Note: The spec states that the name
# part should be non-empty but git-fast-export doesn't always do that so
# the first bit is \w*, not \w+.) Also git-fast-import code says the
# space before the email is optional.
# Matches 'name <email> when' -- groups are (name, email, when).
_WHO_AND_WHEN_RE = re.compile(br'([^<]*)<(.*)> (.+)')
# Matches 'name <email>' with no date part (fallback when the time is missing).
_WHO_RE = re.compile(br'([^<]*)<(.*)>')
class ImportParser(LineBasedParser):
    """Parse a git-fast-import stream into ImportCommand objects.

    See the module docstring for the grammar this parser accepts.
    """

    def __init__(self, input_stream, verbose=False, output=sys.stdout,
                 user_mapper=None, strict=True):
        """A Parser of import commands.

        :param input_stream: the file-like object to read from
        :param verbose: display extra information or not
        :param output: the file-like object to write messages to (YAGNI?)
        :param user_mapper: if not None, the UserMapper used to adjust
          user-ids for authors, committers and taggers.
        :param strict: Raise errors on strictly invalid data
        """
        LineBasedParser.__init__(self, input_stream)
        self.verbose = verbose
        self.output = output
        self.user_mapper = user_mapper
        self.strict = strict
        # We auto-detect the date format when a date is first encountered
        self.date_parser = None
        # Maps feature names seen in the stream to their (optional) values
        self.features = {}

    def warning(self, msg):
        """Write a warning to stderr, tagged with the current line number."""
        sys.stderr.write("warning line %d: %s\n" % (self.lineno, msg))

    def iter_commands(self):
        """Iterator returning ImportCommand objects."""
        while True:
            line = self.next_line()
            if line is None:
                # EOF. If the stream declared the 'done' feature, an
                # explicit 'done' command was required before EOF.
                if b'done' in self.features:
                    raise errors.PrematureEndOfStream(self.lineno)
                break
            elif len(line) == 0 or line.startswith(b'#'):
                # Blank lines and comments may appear between commands
                continue
            # Search for commands in order of likelihood
            elif line.startswith(b'commit '):
                yield self._parse_commit(line[len(b'commit '):])
            elif line.startswith(b'blob'):
                yield self._parse_blob()
            elif line.startswith(b'done'):
                break
            elif line.startswith(b'progress '):
                yield commands.ProgressCommand(line[len(b'progress '):])
            elif line.startswith(b'reset '):
                yield self._parse_reset(line[len(b'reset '):])
            elif line.startswith(b'tag '):
                yield self._parse_tag(line[len(b'tag '):])
            elif line.startswith(b'checkpoint'):
                yield commands.CheckpointCommand()
            elif line.startswith(b'feature'):
                yield self._parse_feature(line[len(b'feature '):])
            else:
                self.abort(errors.InvalidCommand, line)

    def iter_file_commands(self):
        """Iterator returning FileCommand objects.

        If an invalid file command is found, the line is silently
        pushed back and iteration ends.
        """
        while True:
            line = self.next_line()
            if line is None:
                break
            elif len(line) == 0 or line.startswith(b'#'):
                continue
            # Search for file commands in order of likelihood
            elif line.startswith(b'M '):
                yield self._parse_file_modify(line[2:])
            elif line.startswith(b'D '):
                path = self._path(line[2:])
                yield commands.FileDeleteCommand(path)
            elif line.startswith(b'R '):
                old, new = self._path_pair(line[2:])
                yield commands.FileRenameCommand(old, new)
            elif line.startswith(b'C '):
                src, dest = self._path_pair(line[2:])
                yield commands.FileCopyCommand(src, dest)
            elif line.startswith(b'deleteall'):
                yield commands.FileDeleteAllCommand()
            else:
                # Not a file command: hand the line back to the caller
                self.push_line(line)
                break

    def _parse_blob(self):
        """Parse a blob command."""
        lineno = self.lineno
        mark = self._get_mark_if_any()
        data = self._get_data(b'blob')
        return commands.BlobCommand(mark, data, lineno)

    def _parse_commit(self, ref):
        """Parse a commit command.

        :param ref: the ref following the 'commit' keyword
        """
        lineno = self.lineno
        mark = self._get_mark_if_any()
        author = self._get_user_info(b'commit', b'author', False)
        more_authors = []
        while True:
            another_author = self._get_user_info(b'commit', b'author', False)
            if another_author is not None:
                more_authors.append(another_author)
            else:
                break
        committer = self._get_user_info(b'commit', b'committer')
        message = self._get_data(b'commit', b'message')
        from_ = self._get_from()
        merges = []
        while True:
            merge = self._get_merge()
            if merge is not None:
                # while the spec suggests it's illegal, git-fast-export
                # outputs multiple merges on the one line, e.g.
                # merge :x :y :z
                these_merges = merge.split(b' ')
                merges.extend(these_merges)
            else:
                break
        properties = {}
        while True:
            name_value = self._get_property()
            if name_value is not None:
                name, value = name_value
                properties[name] = value
            else:
                break
        return commands.CommitCommand(ref, mark, author, committer, message,
            from_, merges, list(self.iter_file_commands()), lineno=lineno,
            more_authors=more_authors, properties=properties)

    def _parse_feature(self, info):
        """Parse a feature command.

        :param info: the feature name, optionally followed by '=value'
        """
        parts = info.split(b'=', 1)
        name = parts[0]
        if len(parts) > 1:
            value = self._path(parts[1])
        else:
            value = None
        self.features[name] = value
        return commands.FeatureCommand(name, value, lineno=self.lineno)

    def _parse_file_modify(self, info):
        """Parse a filemodify command within a commit.

        :param info: a string in the format "mode dataref path"
          (where dataref might be the hard-coded literal 'inline').
        """
        params = info.split(b' ', 2)
        path = self._path(params[2])
        mode = self._mode(params[0])
        if params[1] == b'inline':
            dataref = None
            data = self._get_data(b'filemodify')
        else:
            dataref = params[1]
            data = None
        return commands.FileModifyCommand(path, mode, dataref,
            data)

    def _parse_reset(self, ref):
        """Parse a reset command."""
        from_ = self._get_from()
        return commands.ResetCommand(ref, from_)

    def _parse_tag(self, name):
        """Parse a tag command."""
        from_ = self._get_from(b'tag')
        tagger = self._get_user_info(b'tag', b'tagger',
            accept_just_who=True)
        message = self._get_data(b'tag', b'message')
        return commands.TagCommand(name, from_, tagger, message)

    def _get_mark_if_any(self):
        """Parse a mark section.

        :return: the mark id (without the leading ':') or None if the
          next line is not a mark directive.
        """
        line = self.next_line()
        if line is None:
            # Truncated stream: previously this crashed with
            # AttributeError; let the caller's next section report
            # the real problem.
            return None
        if line.startswith(b'mark :'):
            return line[len(b'mark :'):]
        else:
            self.push_line(line)
            return None

    def _get_from(self, required_for=None):
        """Parse a from section.

        :param required_for: if set, abort with MissingSection when the
          section is absent instead of returning None.
        """
        line = self.next_line()
        if line is None:
            return None
        elif line.startswith(b'from '):
            return line[len(b'from '):]
        elif required_for:
            self.abort(errors.MissingSection, required_for, 'from')
        else:
            self.push_line(line)
            return None

    def _get_merge(self):
        """Parse a merge section, returning None when absent."""
        line = self.next_line()
        if line is None:
            return None
        elif line.startswith(b'merge '):
            return line[len(b'merge '):]
        else:
            self.push_line(line)
            return None

    def _get_property(self):
        """Parse a property section, returning a (name, value) tuple or None."""
        line = self.next_line()
        if line is None:
            return None
        elif line.startswith(b'property '):
            return self._name_value(line[len(b'property '):])
        else:
            self.push_line(line)
            return None

    def _get_user_info(self, cmd, section, required=True,
            accept_just_who=False):
        """Parse a user section.

        :param cmd: the command name, for error reporting
        :param section: the section keyword expected, e.g. b'author'
        :param required: abort with MissingSection if the section is absent
        :param accept_just_who: allow a missing timestamp (substitutes 'now')
        :return: an Authorship tuple or None
        """
        line = self.next_line()
        if line is None:
            # Truncated stream: previously crashed with AttributeError
            if required:
                self.abort(errors.MissingSection, cmd, section)
            return None
        if line.startswith(section + b' '):
            return self._who_when(line[len(section + b' '):], cmd, section,
                accept_just_who=accept_just_who)
        elif required:
            self.abort(errors.MissingSection, cmd, section)
        else:
            self.push_line(line)
            return None

    def _get_data(self, required_for, section=b'data'):
        """Parse a data section.

        :param required_for: the command needing the data, for error reporting
        :param section: the section name, for error reporting
        :return: the raw bytes of the data section
        """
        line = self.next_line()
        if line is None:
            # Truncated stream: previously crashed with AttributeError
            self.abort(errors.MissingSection, required_for, section)
        if line.startswith(b'data '):
            rest = line[len(b'data '):]
            if rest.startswith(b'<<'):
                # Delimited format: read until the delimiter line
                return self.read_until(rest[2:])
            else:
                size = int(rest)
                read_bytes = self.read_bytes(size)
                # optional LF after data.
                next_line = self.input.readline()
                self.lineno += 1
                if next_line == b'\n':
                    # The optional terminating LF: consume it
                    pass
                elif next_line == b'':
                    # EOF: nothing was actually read, so undo the count.
                    # Previously an empty phantom line was pushed back here.
                    self.lineno -= 1
                elif next_line.endswith(b'\n'):
                    self.push_line(next_line[:-1])
                else:
                    # Final line with no trailing newline: previously the
                    # unconditional [:-1] slice dropped its last byte.
                    self.push_line(next_line)
                return read_bytes
        else:
            self.abort(errors.MissingSection, required_for, section)

    def _who_when(self, s, cmd, section, accept_just_who=False):
        """Parse who and when information from a string.

        :return: a tuple of (name,email,timestamp,timezone). name may be
          the empty string if only an email address was given.
        """
        match = _WHO_AND_WHEN_RE.search(s)
        if match:
            datestr = match.group(3).lstrip()
            if self.date_parser is None:
                # auto-detect the date format
                if len(datestr.split(b' ')) == 2:
                    date_format = 'raw'
                elif datestr == b'now':
                    date_format = 'now'
                else:
                    date_format = 'rfc2822'
                self.date_parser = dates.DATE_PARSERS_BY_NAME[date_format]
            try:
                when = self.date_parser(datestr, self.lineno)
            except ValueError:
                print("failed to parse datestr '%s'" % (datestr,))
                raise
            name = match.group(1).rstrip()
            email = match.group(2)
        else:
            match = _WHO_RE.search(s)
            if accept_just_who and match:
                # HACK around missing time
                # TODO: output a warning here
                when = dates.DATE_PARSERS_BY_NAME['now']('now')
                name = match.group(1)
                email = match.group(2)
            elif self.strict:
                self.abort(errors.BadFormat, cmd, section, s)
            else:
                # Non-strict mode: keep the raw string as the name
                name = s
                email = None
                when = dates.DATE_PARSERS_BY_NAME['now']('now')
        if len(name) > 0:
            if name.endswith(b' '):
                name = name[:-1]
        # While it shouldn't happen, some datasets have email addresses
        # which contain unicode characters. See bug 338186. We sanitize
        # the data at this level just in case.
        if self.user_mapper:
            name, email = self.user_mapper.map_name_and_email(name, email)
        return Authorship(name, email, when[0], when[1])

    def _name_value(self, s):
        """Parse a (name,value) tuple from 'name value-length value'."""
        parts = s.split(b' ', 2)
        name = parts[0]
        if len(parts) == 1:
            value = None
        else:
            size = int(parts[1])
            value = parts[2]
            still_to_read = size - len(value)
            if still_to_read > 0:
                # The value continues on following lines; re-insert the
                # newline that next_line() stripped from the first chunk.
                read_bytes = self.read_bytes(still_to_read)
                value += b'\n' + read_bytes[:still_to_read - 1]
        return (name, value)

    def _path(self, s):
        """Parse a path, unquoting a C-style quoted form if present."""
        if s.startswith(b'"'):
            if not s.endswith(b'"'):
                self.abort(errors.BadFormat, '?', '?', s)
            else:
                return _unquote_c_string(s[1:-1])
        return s

    def _path_pair(self, s):
        """Parse two paths separated by a space."""
        # TODO: handle a space in the first path
        if s.startswith(b'"'):
            parts = s[1:].split(b'" ', 1)
        else:
            parts = s.split(b' ', 1)
        if len(parts) != 2:
            self.abort(errors.BadFormat, '?', '?', s)
        elif parts[1].startswith(b'"') and parts[1].endswith(b'"'):
            parts[1] = parts[1][1:-1]
        elif parts[1].startswith(b'"') or parts[1].endswith(b'"'):
            self.abort(errors.BadFormat, '?', '?', s)
        return [_unquote_c_string(s) for s in parts]

    def _mode(self, s):
        """Check file mode format and parse into an int.

        :return: mode as integer
        """
        # Note: Output from git-fast-export slightly different to spec
        if s in [b'644', b'100644', b'0100644']:
            return 0o100644
        elif s in [b'755', b'100755', b'0100755']:
            return 0o100755
        elif s in [b'040000', b'0040000']:
            return 0o40000
        elif s in [b'120000', b'0120000']:
            return 0o120000
        elif s in [b'160000', b'0160000']:
            return 0o160000
        else:
            self.abort(errors.BadFormat, 'filemodify', 'mode', s)
# Recognised C-style escape sequences inside quoted paths.  The bytes
# variant is applied to bytes input under Python 3, the text variant
# otherwise; the two patterns must be kept in sync.
ESCAPE_SEQUENCE_BYTES_RE = re.compile(br'''
( \\U........ # 8-digit hex escapes
| \\u.... # 4-digit hex escapes
| \\x.. # 2-digit hex escapes
| \\[0-7]{1,3} # Octal escapes
| \\N\{[^}]+\} # Unicode characters by name
| \\[\\'"abfnrtv] # Single-character escapes
)''', re.VERBOSE
)
# Text (unicode) variant of the same escape-sequence pattern.
ESCAPE_SEQUENCE_RE = re.compile(r'''
( \\U........
| \\u....
| \\x..
| \\[0-7]{1,3}
| \\N\{[^}]+\}
| \\[\\'"abfnrtv]
)''', re.UNICODE | re.VERBOSE
)
def _unquote_c_string(s):
    r"""Replace C-style escape sequences (\n, \", etc.) with real chars."""
    # A plain s.encode('utf-8').decode('unicode_escape') can produce
    # incorrect output with unicode strings (on both py2 and py3); the
    # safest approach is to locate each escape sequence and decode it alone.
    def _decode_one(match):
        return utf8_bytes_string(
            codecs.decode(match.group(0), 'unicode-escape')
        )
    if sys.version_info[0] >= 3 and isinstance(s, bytes):
        pattern = ESCAPE_SEQUENCE_BYTES_RE
    else:
        pattern = ESCAPE_SEQUENCE_RE
    return pattern.sub(_decode_one, s)
# Identity parsed from an author/committer/tagger line; timestamp and
# timezone are the two values produced by the configured date parser
# (see ImportParser._who_when) -- presumably epoch seconds and an offset.
Authorship = collections.namedtuple('Authorship', 'name email timestamp timezone')