chrono/src/format/scan.rs
2023-06-29 14:07:56 +02:00

421 lines
14 KiB
Rust
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

// This is a part of Chrono.
// See README.md and LICENSE.txt for details.
/*!
* Various scanning routines for the parser.
*/
use super::{ParseResult, INVALID, OUT_OF_RANGE, TOO_SHORT};
use crate::Weekday;
/// Parses a non-negative decimal number made of `min` to `max` ASCII digits.
///
/// On success returns the unparsed remainder of `s` together with the value.
/// Scanning stops at the first non-digit byte or after `max` digits, whichever
/// comes first; digits beyond the first `max` are left in the remainder.
///
/// # Errors
///
/// * `TOO_SHORT` if `s` holds fewer than `min` bytes.
/// * `INVALID` if a non-digit byte shows up before `min` digits were read.
/// * `OUT_OF_RANGE` if the value does not fit in an `i64`.
///
/// # Panics
///
/// Panics if `min > max`.
#[inline]
pub(super) fn number(s: &str, min: usize, max: usize) -> ParseResult<(&str, i64)> {
    assert!(min <= max);

    // Only ASCII digits matter, so the input can be scanned bytewise. The scan
    // halts on the first non-digit byte, which is either another ASCII
    // character or the first byte of a multi-byte UTF-8 sequence; both are
    // valid places to split the `str`.
    let bytes = s.as_bytes();
    if bytes.len() < min {
        return Err(TOO_SHORT);
    }

    let mut value: i64 = 0;
    for (idx, &byte) in bytes.iter().take(max).enumerate() {
        if !byte.is_ascii_digit() {
            return if idx < min { Err(INVALID) } else { Ok((&s[idx..], value)) };
        }
        let digit = i64::from(byte - b'0');
        value = value.checked_mul(10).and_then(|v| v.checked_add(digit)).ok_or(OUT_OF_RANGE)?;
    }

    // Every inspected byte was a digit: consumed either `max` bytes or all of `s`.
    let consumed = bytes.len().min(max);
    Ok((&s[consumed..], value))
}
/// Tries to consume at least one digit as a fractional second.
///
/// Up to nine digits contribute to the result; any further digits are
/// consumed and discarded. Returns the remaining input and the number of
/// whole nanoseconds (0--999,999,999).
pub(super) fn nanosecond(s: &str) -> ParseResult<(&str, i64)> {
    // SCALE[k] turns a k-digit fraction into nanoseconds.
    const SCALE: [i64; 10] =
        [0, 100_000_000, 10_000_000, 1_000_000, 100_000, 10_000, 1_000, 100, 10, 1];

    let (rest, value) = number(s, 1, 9)?;
    // The number of digits read is the difference in length before and after.
    let digits = s.len() - rest.len();
    let nanos = value.checked_mul(SCALE[digits]).ok_or(OUT_OF_RANGE)?;

    // Precision finer than a nanosecond is dropped: skip any digits that follow.
    let rest = rest.trim_start_matches(|c: char| c.is_ascii_digit());
    Ok((rest, nanos))
}
/// Tries to consume exactly `digits` digits as a fractional second.
///
/// Returns the remaining input and the number of whole nanoseconds
/// (0--999,999,999).
///
/// # Panics
///
/// Panics if `digits` is greater than 9 and that many digits were parsed,
/// because the scale table has no entry for it.
pub(super) fn nanosecond_fixed(s: &str, digits: usize) -> ParseResult<(&str, i64)> {
    // SCALE[k] turns a k-digit fraction into nanoseconds.
    const SCALE: [i64; 10] =
        [0, 100_000_000, 10_000_000, 1_000_000, 100_000, 10_000, 1_000, 100, 10, 1];

    // The digit count is fixed by the caller, so it is both the minimum and the maximum.
    let (rest, value) = number(s, digits, digits)?;
    match value.checked_mul(SCALE[digits]) {
        Some(nanos) => Ok((rest, nanos)),
        None => Err(OUT_OF_RANGE),
    }
}
/// Tries to parse a three-letter ASCII month abbreviation, ignoring case.
///
/// Returns the remaining input and the month index (0 for January through 11
/// for December). Fails with `TOO_SHORT` when fewer than three bytes are
/// available and with `INVALID` when they do not spell a month.
pub(super) fn short_month0(s: &str) -> ParseResult<(&str, u8)> {
    // Setting bit 5 (`| 32`) folds ASCII uppercase letters onto lowercase ones.
    let folded = match s.as_bytes() {
        &[a, b, c, ..] => [a | 32, b | 32, c | 32],
        _ => return Err(TOO_SHORT),
    };
    let month0 = match &folded {
        b"jan" => 0,
        b"feb" => 1,
        b"mar" => 2,
        b"apr" => 3,
        b"may" => 4,
        b"jun" => 5,
        b"jul" => 6,
        b"aug" => 7,
        b"sep" => 8,
        b"oct" => 9,
        b"nov" => 10,
        b"dec" => 11,
        _ => return Err(INVALID),
    };
    // The three matched bytes are ASCII letters, so index 3 is a char boundary.
    Ok((&s[3..], month0))
}
/// Tries to parse a three-letter ASCII weekday abbreviation, ignoring case.
///
/// Returns the remaining input and the matching `Weekday`. Fails with
/// `TOO_SHORT` when fewer than three bytes are available and with `INVALID`
/// when they do not spell a weekday.
pub(super) fn short_weekday(s: &str) -> ParseResult<(&str, Weekday)> {
    // Setting bit 5 (`| 32`) folds ASCII uppercase letters onto lowercase ones.
    let folded = match s.as_bytes() {
        &[a, b, c, ..] => [a | 32, b | 32, c | 32],
        _ => return Err(TOO_SHORT),
    };
    let weekday = match &folded {
        b"mon" => Weekday::Mon,
        b"tue" => Weekday::Tue,
        b"wed" => Weekday::Wed,
        b"thu" => Weekday::Thu,
        b"fri" => Weekday::Fri,
        b"sat" => Weekday::Sat,
        b"sun" => Weekday::Sun,
        _ => return Err(INVALID),
    };
    // The three matched bytes are ASCII letters, so index 3 is a char boundary.
    Ok((&s[3..], weekday))
}
/// Tries to parse a month name in either its short or its long form.
///
/// The three-letter abbreviation is required; if the rest of the full name
/// follows (ignoring ASCII case) it is consumed too, so the long form wins
/// whenever both would match. Returns the remaining input and the month index
/// (0 through 11).
pub(super) fn short_or_long_month0(s: &str) -> ParseResult<(&str, u8)> {
    // Lowercase month names with their first three letters removed.
    const LONG_MONTH_TAILS: [&str; 12] = [
        "uary", "ruary", "ch", "il", "", "e", "y", "ust", "tember", "ober", "ember", "ember",
    ];

    let (rest, month0) = short_month0(s)?;
    let tail = LONG_MONTH_TAILS[month0 as usize];

    // `get` yields `None` when fewer bytes than the tail remain.
    let rest = match rest.as_bytes().get(..tail.len()) {
        Some(head) if head.eq_ignore_ascii_case(tail.as_bytes()) => &rest[tail.len()..],
        _ => rest,
    };
    Ok((rest, month0))
}
/// Tries to parse a weekday name in either its short or its long form.
///
/// The three-letter abbreviation is required; if the rest of the full name
/// follows (ignoring ASCII case) it is consumed too, so the long form wins
/// whenever both would match. Returns the remaining input and the `Weekday`.
pub(super) fn short_or_long_weekday(s: &str) -> ParseResult<(&str, Weekday)> {
    // Lowercase weekday names with their first three letters removed,
    // indexed by the number of days since Monday.
    const LONG_WEEKDAY_TAILS: [&str; 7] =
        ["day", "sday", "nesday", "rsday", "day", "urday", "day"];

    let (rest, weekday) = short_weekday(s)?;
    let tail = LONG_WEEKDAY_TAILS[weekday.num_days_from_monday() as usize];

    // `get` yields `None` when fewer bytes than the tail remain.
    let rest = match rest.as_bytes().get(..tail.len()) {
        Some(head) if head.eq_ignore_ascii_case(tail.as_bytes()) => &rest[tail.len()..],
        _ => rest,
    };
    Ok((rest, weekday))
}
/// Tries to consume exactly one given character, supplied as the byte `c1`.
///
/// Returns the input after that byte on a match, `INVALID` when `s` starts
/// with a different byte, and `TOO_SHORT` when `s` is empty.
pub(super) fn char(s: &str, c1: u8) -> ParseResult<&str> {
    let first = *s.as_bytes().first().ok_or(TOO_SHORT)?;
    if first == c1 {
        Ok(&s[1..])
    } else {
        Err(INVALID)
    }
}
/// Tries to consume one or more leading whitespace characters.
///
/// Returns the input with its leading whitespace removed. Fails with
/// `TOO_SHORT` on empty input and with `INVALID` when the input does not start
/// with whitespace.
pub(super) fn space(s: &str) -> ParseResult<&str> {
    if s.is_empty() {
        return Err(TOO_SHORT);
    }
    let rest = s.trim_start();
    // Unchanged length means nothing was trimmed, i.e. no whitespace at the front.
    if rest.len() == s.len() {
        Err(INVALID)
    } else {
        Ok(rest)
    }
}
/// Consumes any number (including zero) of leading colons and whitespace
/// characters.
///
/// Never fails; the `ParseResult` return type lets it be used where a
/// fallible separator parser is expected.
pub(crate) fn colon_or_space(s: &str) -> ParseResult<&str> {
    let is_separator = |c: char| c == ':' || c.is_whitespace();
    // Byte index of the first character to keep, or the end of the input.
    let start = s.find(|c: char| !is_separator(c)).unwrap_or(s.len());
    Ok(&s[start..])
}
/// Parse a timezone from `s` and return the offset in seconds.
///
/// The accepted shape is a sign, two hour digits (00--99), whatever
/// `consume_colon` accepts, and two minute digits (00--59). On success the
/// remaining input is returned together with the signed offset.
///
/// The `consume_colon` function is used to parse a mandatory or optional `:`
/// separator between hours offset and minutes offset.
///
/// The `allow_missing_minutes` flag allows the timezone minutes offset to be
/// missing from `s`. Minutes count as missing only when the input ends right
/// after the separator; if at least two bytes remain they must be the minute
/// digits, and a single leftover byte is reported as `TOO_SHORT`.
///
/// The `allow_tz_minus_sign` flag allows the timezone offset negative character
/// to also be MINUS SIGN (U+2212, `\u{2212}`) in addition to the typical
/// ASCII-compatible `-` HYPHEN-MINUS (U+2D).
/// This is part of [RFC 3339 & ISO 8601].
///
/// # Errors
///
/// * `TOO_SHORT` if `s` is empty, the hour digits are incomplete, or the
///   minutes are missing without being allowed to.
/// * `INVALID` if the sign is unknown (or is U+2212 while not allowed), or a
///   non-digit appears where a digit is required.
/// * `OUT_OF_RANGE` if the minutes are in 60--99.
/// * Any error returned by `consume_colon`.
///
/// [RFC 3339 & ISO 8601]: https://en.wikipedia.org/w/index.php?title=ISO_8601&oldid=1114309368#Time_offsets_from_UTC
pub(crate) fn timezone_offset<F>(
    mut s: &str,
    mut consume_colon: F,
    allow_missing_minutes: bool,
    allow_tz_minus_sign: bool,
) -> ParseResult<(&str, i32)>
where
    F: FnMut(&str) -> ParseResult<&str>,
{
    // MINUS SIGN (U+2212), spelled as an escape so the literal cannot be lost
    // or confused with HYPHEN-MINUS by editors and web viewers.
    const MINUS_SIGN: char = '\u{2212}';

    // Returns the next two bytes of `s` without validating them.
    const fn digits(s: &str) -> ParseResult<(u8, u8)> {
        let b = s.as_bytes();
        if b.len() < 2 {
            Err(TOO_SHORT)
        } else {
            Ok((b[0], b[1]))
        }
    }

    let negative = match s.chars().next() {
        Some('+') => {
            // PLUS SIGN (U+2B)
            s = &s['+'.len_utf8()..];
            false
        }
        Some('-') => {
            // HYPHEN-MINUS (U+2D)
            s = &s['-'.len_utf8()..];
            true
        }
        Some(MINUS_SIGN) => {
            // MINUS SIGN (U+2212)
            if !allow_tz_minus_sign {
                return Err(INVALID);
            }
            s = &s[MINUS_SIGN.len_utf8()..];
            true
        }
        Some(_) => return Err(INVALID),
        None => return Err(TOO_SHORT),
    };

    // hours (00--99)
    let hours = match digits(s)? {
        (h1 @ b'0'..=b'9', h2 @ b'0'..=b'9') => i32::from((h1 - b'0') * 10 + (h2 - b'0')),
        _ => return Err(INVALID),
    };
    s = &s[2..];

    // colons (and possibly other separators)
    s = consume_colon(s)?;

    // minutes (00--59)
    // if the next two items are digits then we have to add minutes
    let minutes = if let Ok(ds) = digits(s) {
        match ds {
            (m1 @ b'0'..=b'5', m2 @ b'0'..=b'9') => i32::from((m1 - b'0') * 10 + (m2 - b'0')),
            (b'6'..=b'9', b'0'..=b'9') => return Err(OUT_OF_RANGE),
            _ => return Err(INVALID),
        }
    } else if allow_missing_minutes {
        0
    } else {
        return Err(TOO_SHORT);
    };

    // With two or more bytes left, the first two are the minute digits that
    // were just validated; with none left, the minutes were legitimately missing.
    s = match s.len() {
        len if len >= 2 => &s[2..],
        0 => s,
        _ => return Err(TOO_SHORT),
    };

    let seconds = hours * 3600 + minutes * 60;
    Ok((s, if negative { -seconds } else { seconds }))
}
/// Same as `timezone_offset` but also allows for `z`/`Z` and for `UTC` in any
/// ASCII case, each of which is the same as `+00:00`.
///
/// Input starting with `u`/`U` that does not continue with `tc` is `INVALID`.
/// Anything else is handed to `timezone_offset` with mandatory minutes and the
/// U+2212 minus sign allowed.
pub(super) fn timezone_offset_zulu<F>(s: &str, colon: F) -> ParseResult<(&str, i32)>
where
    F: FnMut(&str) -> ParseResult<&str>,
{
    let bytes = s.as_bytes();
    match bytes.first().map(u8::to_ascii_lowercase) {
        Some(b'z') => Ok((&s[1..], 0)),
        Some(b'u') => {
            // `get` yields `None` when fewer than three bytes are available.
            let is_utc = bytes.get(1..3).map_or(false, |tc| tc.eq_ignore_ascii_case(b"tc"));
            if is_utc {
                Ok((&s[3..], 0))
            } else {
                Err(INVALID)
            }
        }
        _ => timezone_offset(s, colon, false, true),
    }
}
/// Same as `timezone_offset` but also allows for `z`/`Z` which is the same as
/// `+00:00`, and allows the minutes to be missing entirely.
///
/// Input not starting with `z`/`Z` is handed to `timezone_offset` with the
/// U+2212 minus sign allowed.
pub(super) fn timezone_offset_permissive<F>(s: &str, colon: F) -> ParseResult<(&str, i32)>
where
    F: FnMut(&str) -> ParseResult<&str>,
{
    match s.strip_prefix(|c: char| c == 'z' || c == 'Z') {
        Some(rest) => Ok((rest, 0)),
        None => timezone_offset(s, colon, true, true),
    }
}
/// Same as `timezone_offset` but also allows for RFC 2822 legacy timezones.
///
/// If `s` starts with ASCII letters, that whole run is consumed as a zone
/// name: the names listed in the table below map to their fixed offsets, a
/// single letter other than `j`/`J` maps to a zero offset, and any other name
/// yields `None`. Otherwise a numeric offset without separator is parsed.
///
/// May return `None` which indicates an insufficient offset data (i.e. `-0000`).
/// See [RFC 2822 Section 4.3].
///
/// [RFC 2822 Section 4.3]: https://tools.ietf.org/html/rfc2822#section-4.3
pub(super) fn timezone_offset_2822(s: &str) -> ParseResult<(&str, Option<i32>)> {
    // Legacy zone names (lowercase) and their offsets in hours.
    const LEGACY_ZONES: [(&str, i32); 10] = [
        ("gmt", 0),
        ("ut", 0),
        ("edt", -4),
        ("est", -5),
        ("cdt", -5),
        ("cst", -6),
        ("mdt", -6),
        ("mst", -7),
        ("pdt", -7),
        ("pst", -8),
    ];

    // Length of the leading run of ASCII letters, i.e. the candidate zone name.
    let name_len = s.bytes().take_while(u8::is_ascii_alphabetic).count();
    if name_len == 0 {
        let (rest, offset) = timezone_offset(s, |s| Ok(s), false, false)?;
        return Ok((rest, Some(offset)));
    }

    let (name, rest) = s.split_at(name_len);
    let known = LEGACY_ZONES.iter().find(|&&(zone, _)| name.eq_ignore_ascii_case(zone));
    let hours = match known {
        Some(&(_, hours)) => Some(hours),
        None => match name.as_bytes() {
            // Single-letter military zones: RFC 2822 recommends treating them
            // as -0000; they are consumed and reported here as a zero offset.
            [b'a'..=b'i'] | [b'k'..=b'z'] | [b'A'..=b'I'] | [b'K'..=b'Z'] => Some(0),
            _ => None,
        },
    };
    Ok((rest, hours.map(|h| h * 3600)))
}
/// Tries to consume an RFC2822 comment including preceding ` `.
///
/// Returns the remaining string after the closing parenthesis.
pub(super) fn comment_2822(s: &str) -> ParseResult<(&str, ())> {
use CommentState::*;
let s = s.trim_start();
let mut state = Start;
for (i, c) in s.bytes().enumerate() {
state = match (state, c) {
(Start, b'(') => Next(1),
(Next(1), b')') => return Ok((&s[i + 1..], ())),
(Next(depth), b'\\') => Escape(depth),
(Next(depth), b'(') => Next(depth + 1),
(Next(depth), b')') => Next(depth - 1),
(Next(depth), _) | (Escape(depth), _) => Next(depth),
_ => return Err(INVALID),
};
}
Err(TOO_SHORT)
}
/// Scanner state used by `comment_2822` while walking the input byte by byte.
enum CommentState {
/// No byte consumed yet; the next byte must be the opening `(`.
Start,
/// Inside a comment; the payload is the current parenthesis nesting depth.
Next(usize),
/// A backslash was just read at the given depth; the next byte is taken literally.
Escape(usize),
}
#[cfg(test)]
mod tests {
    use super::comment_2822;
    use crate::format::{INVALID, TOO_SHORT};

    /// Table-driven check of `comment_2822`: each case pairs an input with the
    /// expected remainder or error.
    #[test]
    fn test_rfc2822_comments() {
        let cases = [
            // missing or unterminated comments
            ("", Err(TOO_SHORT)),
            (" ", Err(TOO_SHORT)),
            ("x", Err(INVALID)),
            ("(", Err(TOO_SHORT)),
            // plain comments, leading whitespace and trailing input
            ("()", Ok("")),
            (" \r\n\t()", Ok("")),
            ("() ", Ok(" ")),
            ("()z", Ok("z")),
            ("(x)", Ok("")),
            // nesting
            ("(())", Ok("")),
            ("((()))", Ok("")),
            ("(x(x(x)x)x)", Ok("")),
            ("( x ( x ( x ) x ) x )", Ok("")),
            // backslash escapes
            (r"(\)", Err(TOO_SHORT)),
            (r"(\()", Ok("")),
            (r"(\))", Ok("")),
            (r"(\\)", Ok("")),
            // sibling comments inside one outer comment
            ("(()())", Ok("")),
            ("( x ( x ) x ( x ) x )", Ok("")),
        ];

        for &(input, ref expected) in &cases {
            let actual = comment_2822(input).map(|(rest, ())| rest);
            assert_eq!(
                *expected, actual,
                "{:?} expected to produce {:?}, but produced {:?}.",
                input, expected, actual
            );
        }
    }
}