From d2f423521ec76406944ad83098ec33afe20c692b Mon Sep 17 00:00:00 2001
From: Kim Altintop
Date: Mon, 9 Jan 2023 13:18:33 +0100
Subject: This is it

Squashed commit of all the exploration history. Development starts here.

Signed-off-by: Kim Altintop
---
 src/bundle/error.rs  |  31 +++++
 src/bundle/fetch.rs  | 130 ++++++++++++++++++
 src/bundle/header.rs | 365 +++++++++++++++++++++++++++++++++++++++++++++++++++
 src/bundle/list.rs   | 335 ++++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 861 insertions(+)
 create mode 100644 src/bundle/error.rs
 create mode 100644 src/bundle/fetch.rs
 create mode 100644 src/bundle/header.rs
 create mode 100644 src/bundle/list.rs

(limited to 'src/bundle')

diff --git a/src/bundle/error.rs b/src/bundle/error.rs
new file mode 100644
index 0000000..41529c2
--- /dev/null
+++ b/src/bundle/error.rs
@@ -0,0 +1,31 @@
+// Copyright © 2022 Kim Altintop
+// SPDX-License-Identifier: GPL-2.0-only WITH openvpn-openssl-exception
+
+use thiserror::Error;
+
+use super::{
+    ObjectFormat,
+    ObjectId,
+};
+use crate::git::refs;
+
+/// Errors describing an invalid or unreadable bundle header.
+#[derive(Debug, Error)]
+pub enum Header {
+    /// The header is structurally invalid; the payload is a fixed description
+    /// of what was wrong.
+    #[error("invalid header: {0}")]
+    Format(&'static str),
+
+    /// A header line could not be interpreted; the payload is that line.
+    #[error("unrecognised header {0}")]
+    UnrecognisedHeader(String),
+
+    /// An object id whose kind does not match the object format in effect.
+    #[error("object id {oid} not valid for object-format {fmt}")]
+    ObjectFormat { fmt: ObjectFormat, oid: ObjectId },
+
+    /// A reference name was rejected by the refname parser.
+    #[error("invalid reference name")]
+    Refname(#[from] refs::error::RefFormat),
+
+    /// An object id was not valid hexadecimal.
+    #[error("invalid hex oid")]
+    Oid(#[from] hex::FromHexError),
+
+    /// An I/O error, reported with the underlying error's own message.
+    #[error(transparent)]
+    Io(#[from] std::io::Error),
+}
diff --git a/src/bundle/fetch.rs b/src/bundle/fetch.rs
new file mode 100644
index 0000000..4e58000
--- /dev/null
+++ b/src/bundle/fetch.rs
@@ -0,0 +1,130 @@
+// Copyright © 2022 Kim Altintop
+// SPDX-License-Identifier: GPL-2.0-only WITH openvpn-openssl-exception
+
+use std::{
+    fs,
+    io::{
+        self,
+        Read,
+        Seek,
+        SeekFrom,
+        Write,
+    },
+    path::{
+        Path,
+        PathBuf,
+    },
+};
+
+use anyhow::ensure;
+use either::Either::{
+    self,
+    Left,
+    Right,
+};
+use sha2::{
+    Digest,
+    Sha256,
+};
+use tempfile::NamedTempFile;
+use url::Url;
+
+use super::{
+    header,
+    Expect,
+    Header,
+};
+use crate::{
+    bundle,
+    fs::LockedFile,
+    git,
+    io::HashWriter,
+};
+
+/// Upper bound on the number of body bytes read when the response is treated
+/// as a bundle list rather than a bundle.
+const MAX_BUNDLE_URIS_BYTES: u64 = 50_000;
+
+/// A bundle which was downloaded and stored on disk.
+pub struct Fetched {
+    path: PathBuf,
+    info: bundle::Info,
+}
+
+impl Fetched {
+    /// Decompose into the on-disk path and the recorded [`bundle::Info`].
+    pub fn into_inner(self) -> (PathBuf, bundle::Info) {
+        (self.path, self.info)
+    }
+}
+
+/// HTTP client used to download bundles and bundle lists.
+pub struct Fetcher {
+    agent: ureq::Agent,
+}
+
+impl Default for Fetcher {
+    fn default() -> Self {
+        Self {
+            agent: ureq::agent(),
+        }
+    }
+}
+
+impl Fetcher {
+    /// `GET` the resource at `url` and store or parse it.
+    ///
+    /// The first 16 bytes of the response decide how the body is treated:
+    ///
+    /// * If they start with a v2 or v3 bundle signature, the body is written
+    ///   to `<out_dir>/<expect.hash>.<bundle::FILE_EXTENSION>` while being
+    ///   hashed with SHA-256, its header is parsed, and [`Right`] is returned.
+    /// * Otherwise the body (at most [`MAX_BUNDLE_URIS_BYTES`] past the first
+    ///   16 bytes) is parsed as a git config holding a bundle list, and
+    ///   [`Left`] is returned.
+    ///
+    /// # Errors
+    ///
+    /// Fails on request or I/O errors, if the signature line is not
+    /// terminated by a newline, if `expect.checksum` is set and differs from
+    /// the computed checksum, or if the header or the list cannot be parsed.
+    pub fn fetch(
+        &self,
+        url: &Url,
+        out_dir: &Path,
+        expect: Expect,
+    ) -> crate::Result<Either<bundle::List, Fetched>> {
+        let resp = self.agent.request_url("GET", url).call()?;
+        let mut body = resp.into_reader();
+
+        // A signature is 15 bytes, so 16 bytes cover it plus its newline.
+        let mut buf = [0; 16];
+        body.read_exact(&mut buf)?;
+        let is_bundle = buf.starts_with(header::SIGNATURE_V2.as_bytes())
+            || buf.starts_with(header::SIGNATURE_V3.as_bytes());
+
+        if is_bundle {
+            ensure!(
+                matches!(buf.last(), Some(b'\n')),
+                "malformed bundle header: trailing data"
+            );
+
+            let mut path = out_dir.join(expect.hash.to_string());
+            path.set_extension(bundle::FILE_EXTENSION);
+
+            let mut lck = {
+                fs::create_dir_all(out_dir)?;
+                LockedFile::atomic(&path, true, LockedFile::DEFAULT_PERMISSIONS)?
+            };
+
+            let mut out = HashWriter::new(Sha256::new(), &mut lck);
+            out.write_all(&buf)?;
+
+            // NOTE(review): 16 bytes were already consumed above, so up to
+            // `expect.len + 16` bytes can end up in the file, and `len` is
+            // never compared against `expect.len` -- confirm this is intended.
+            let len = buf.len() as u64 + io::copy(&mut body.take(expect.len), &mut out)?;
+            let checksum = out.hash().into();
+            if let Some(chk) = expect.checksum {
+                ensure!(chk == checksum, "checksum mismatch");
+            }
+            // Re-read what was just written in order to parse the header.
+            lck.seek(SeekFrom::Start(0))?;
+            let header = Header::from_reader(&mut lck)?;
+            let hash = header.hash();
+
+            lck.persist()?;
+
+            let info = bundle::Info {
+                len,
+                hash,
+                checksum,
+                uris: vec![url.clone()],
+            };
+            Ok(Right(Fetched { path, info }))
+        } else {
+            // git2 opens config from a path, hence the temporary file.
+            let mut tmp = NamedTempFile::new()?;
+            tmp.write_all(&buf)?;
+            io::copy(&mut body.take(MAX_BUNDLE_URIS_BYTES), &mut tmp)?;
+            let cfg = git::config::Snapshot::try_from(git2::Config::open(tmp.path())?)?;
+            let list = bundle::List::from_config(cfg)?;
+
+            Ok(Left(list))
+        }
+    }
+}
diff --git a/src/bundle/header.rs b/src/bundle/header.rs
new file mode 100644
index 0000000..6f3dfe3
--- /dev/null
+++ b/src/bundle/header.rs
@@ -0,0 +1,365 @@
+// Copyright © 2022 Kim Altintop
+// SPDX-License-Identifier: GPL-2.0-only WITH openvpn-openssl-exception
+
+use core::fmt;
+use std::{
+    collections::{
+        BTreeMap,
+        BTreeSet,
+    },
+    io,
+    ops::Deref,
+    str::FromStr,
+};
+
+use hex::{
+    FromHex,
+    FromHexError,
+};
+use refs::Refname;
+use sha2::{
+    Digest,
+    Sha256,
+};
+
+use super::error;
+use crate::{
+    git::refs,
+    io::Lines,
+};
+
+/// First line of a version 2 bundle (without the trailing newline).
+pub const SIGNATURE_V2: &str = "# v2 git bundle";
+/// First line of a version 3 bundle (without the trailing newline).
+pub const SIGNATURE_V3: &str = "# v3 git bundle";
+
+/// Bundle format version; defaults to [`Version::V2`].
+#[derive(Debug, serde::Serialize, serde::Deserialize)]
+#[serde(rename_all = "lowercase")]
+pub enum Version {
+    V2,
+    V3,
+}
+
+impl Default for Version {
+    fn default() -> Self {
+        Self::V2
+    }
+}
+
+/// Hash algorithm of the object ids in a bundle; defaults to
+/// [`ObjectFormat::Sha1`].
+#[derive(Debug, serde::Serialize, serde::Deserialize)]
+#[serde(rename_all = "lowercase")]
+pub enum ObjectFormat {
+    Sha1,
+    Sha256,
+}
+
+impl Default for ObjectFormat {
+    fn default() -> Self {
+        Self::Sha1
+    }
+}
+
+impl fmt::Display for ObjectFormat {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        f.write_str(match self {
+            Self::Sha1 => "sha1",
+            Self::Sha256 => "sha256",
+        })
+    }
+}
+
+/// A raw object id of either 20 bytes (`Sha1`) or 32 bytes (`Sha2`).
+///
+/// Serialized and displayed as lowercase hex.
+#[derive(Clone, Copy, Eq, Ord, PartialEq, PartialOrd, serde::Serialize, serde::Deserialize)]
+#[serde(untagged)]
+pub enum ObjectId {
+    Sha1(#[serde(with = "hex::serde")] [u8; 20]),
+    Sha2(#[serde(with = "hex::serde")] [u8; 32]),
+}
+
+impl ObjectId {
+    /// The raw bytes of the id.
+    pub fn as_bytes(&self) -> &[u8] {
+        self.as_ref()
+    }
+}
+
+impl AsRef<[u8]> for ObjectId {
+    fn as_ref(&self) -> &[u8] {
+        match self {
+            Self::Sha1(b) => &b[..],
+            Self::Sha2(b) => &b[..],
+        }
+    }
+}
+
+impl fmt::Display for ObjectId {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        f.write_str(&hex::encode(self))
+    }
+}
+
+impl fmt::Debug for ObjectId {
+    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+        match self {
+            Self::Sha1(x) => f.debug_tuple("Sha1").field(&hex::encode(x)).finish(),
+            Self::Sha2(x) => f.debug_tuple("Sha2").field(&hex::encode(x)).finish(),
+        }
+    }
+}
+
+impl FromHex for ObjectId {
+    type Error = hex::FromHexError;
+
+    /// Decode from 40 (`Sha1`) or 64 (`Sha2`) hex digits; any other input
+    /// length yields [`hex::FromHexError::InvalidStringLength`].
+    #[inline]
+    fn from_hex<T: AsRef<[u8]>>(hex: T) -> Result<Self, Self::Error> {
+        match hex.as_ref().len() {
+            40 => Ok(Self::Sha1(<[u8; 20]>::from_hex(hex)?)),
+            64 => Ok(Self::Sha2(<[u8; 32]>::from_hex(hex)?)),
+            _ => Err(hex::FromHexError::InvalidStringLength),
+        }
+    }
+}
+
+impl From<&git2::Oid> for ObjectId {
+    fn from(oid: &git2::Oid) -> Self {
+        let bs = oid.as_bytes();
+        match bs.len() {
+            // The length was just matched, so the conversions cannot fail.
+            20 => Self::Sha1(bs.try_into().unwrap()),
+            32 => Self::Sha2(bs.try_into().unwrap()),
+            x => unreachable!("oid with strange hash size: {}", x),
+        }
+    }
+}
+
+impl TryFrom<&ObjectId> for git2::Oid {
+    type Error = git2::Error;
+
+    /// Only `Sha1` ids convert; `Sha2` ids yield an error.
+    fn try_from(oid: &ObjectId) -> Result<Self, Self::Error> {
+        match oid {
+            ObjectId::Sha1(hash) => Self::from_bytes(hash),
+            ObjectId::Sha2(_) => Err(git2::Error::new(
+                git2::ErrorCode::Invalid,
+                git2::ErrorClass::Sha1,
+                "sha2 oids not yet supported",
+            )),
+        }
+    }
+}
+
+#[derive(Debug, Default, serde::Serialize, serde::Deserialize)]
+/// The header section of a git bundle: format version, object format,
+/// prerequisite object ids and the references with their tips.
+#[serde(rename_all = "kebab-case")]
+pub struct Header {
+    pub version: Version,
+    pub object_format: ObjectFormat,
+    pub prerequisites: BTreeSet<ObjectId>,
+    pub references: BTreeMap<Refname, ObjectId>,
+}
+
+impl Header {
+    /// Parse a [`Header`] from an IO stream.
+    ///
+    /// The stream will be buffered internally, and its position set to the
+    /// start of the packfile section.
+    ///
+    /// # Errors
+    ///
+    /// Returns an [`error::Header`] if the signature line is missing or
+    /// unknown, a v3 header does not start with an `@object-format=`
+    /// capability, a prerequisite or reference line is malformed, a refname
+    /// does not start with `refs/` or occurs twice, no reference is present,
+    /// or the underlying stream fails.
+    pub fn from_reader<R>(mut io: R) -> Result<Self, error::Header>
+    where
+        R: io::Read + io::Seek,
+    {
+        use hex::FromHex as _;
+
+        let mut lines = Lines::new(io::BufReader::new(&mut io)).until_blank();
+
+        let mut prerequisites = BTreeSet::new();
+        let mut references = BTreeMap::new();
+
+        let (version, mut object_format) = match lines
+            .next()
+            .ok_or(error::Header::Format("empty input"))??
+            .as_str()
+        {
+            // v2 has no capabilities, the object format is fixed
+            SIGNATURE_V2 => (Version::V2, Some(ObjectFormat::Sha1)),
+            // v3 announces the object format as a capability, parsed below
+            SIGNATURE_V3 => (Version::V3, None),
+            _ => return Err(error::Header::Format("invalid signature")),
+        };
+
+        if let Version::V3 = version {
+            for capability in lines.by_ref() {
+                let capability = capability?;
+
+                if !capability.starts_with('@') {
+                    return Err(error::Header::Format("expected capabilities"));
+                }
+
+                if capability.starts_with("@filter") {
+                    return Err(error::Header::Format("object filters are not supported"));
+                }
+
+                match capability.strip_prefix("@object-format=") {
+                    Some("sha1") => {
+                        object_format = Some(ObjectFormat::Sha1);
+                    },
+
+                    Some("sha256") => {
+                        object_format = Some(ObjectFormat::Sha256);
+                    },
+
+                    _ => return Err(error::Header::Format("unrecognised capability")),
+                }
+
+                // The object format is the only capability understood, so
+                // stop reading capabilities as soon as it is known.
+                if object_format.is_some() {
+                    break;
+                }
+            }
+        }
+
+        let object_format = object_format.ok_or(error::Header::Format("missing object-format"))?;
+        let oid_hexsz = match object_format {
+            ObjectFormat::Sha1 => 40,
+            ObjectFormat::Sha256 => 64,
+        };
+
+        for tip in lines.by_ref() {
+            let tip = tip?;
+            // Prerequisite lines are marked by a leading '-'.
+            let oid_off = usize::from(tip.starts_with('-'));
+            let oid_end = oid_off + oid_hexsz;
+
+            // `is_char_boundary` is also false when `oid_end` lies beyond the
+            // end of the line, so the slice below cannot panic.
+            if !tip.is_char_boundary(oid_end) {
+                return Err(error::Header::UnrecognisedHeader(tip));
+            }
+            let oid = ObjectId::from_hex(&tip[oid_off..oid_end])?;
+            if matches!(
+                (&object_format, &oid),
+                (ObjectFormat::Sha1, ObjectId::Sha2(_)) | (ObjectFormat::Sha256, ObjectId::Sha1(_))
+            ) {
+                return Err(error::Header::ObjectFormat {
+                    fmt: object_format,
+                    oid,
+                });
+            }
+            // The oid must be followed by a space or the end of the line.
+            if !matches!(tip.as_bytes().get(oid_end), None | Some(b' ')) {
+                return Err(error::Header::UnrecognisedHeader(tip));
+            }
+
+            if oid_off > 0 {
+                prerequisites.insert(oid);
+            } else {
+                // A reference line consisting of only an oid has no name.
+                let refname = tip
+                    .get(oid_end + 1..)
+                    .ok_or(error::Header::Format("missing refname"))?;
+                if !refname.starts_with("refs/") {
+                    return Err(error::Header::Format("shorthand refname"));
+                }
+                if references.insert(refname.parse()?, oid).is_some() {
+                    return Err(error::Header::Format("duplicate refname"));
+                }
+            }
+        }
+
+        if references.is_empty() {
+            return Err(error::Header::Format("empty references"));
+        }
+
+        // The buffered reader may have read past the header, so reposition
+        // the underlying stream to where the line reader actually stopped.
+        let pos = io::Seek::stream_position(&mut lines)?;
+        drop(lines);
+        io.seek(io::SeekFrom::Start(pos))?;
+
+        Ok(Header {
+            version,
+            object_format,
+            prerequisites,
+            references,
+        })
+    }
+
+    /// Write the header in bundle format, including the terminating blank
+    /// line.
+    pub fn to_writer<W>(&self, mut io: W) -> io::Result<()>
+    where
+        W: io::Write,
+    {
+        match self.version {
+            Version::V2 => writeln!(&mut io, "{}", SIGNATURE_V2)?,
+            Version::V3 => {
+                writeln!(&mut io, "{}", SIGNATURE_V3)?;
+                match self.object_format {
+                    ObjectFormat::Sha1 => writeln!(&mut io, "@object-format=sha1")?,
+                    ObjectFormat::Sha256 => writeln!(&mut io, "@object-format=sha256")?,
+                }
+            },
+        }
+        for pre in &self.prerequisites {
+            writeln!(&mut io, "-{}", pre)?;
+        }
+        for (name, oid) in &self.references {
+            writeln!(&mut io, "{} {}", oid, name)?;
+        }
+
+        writeln!(&mut io)
+    }
+
+    /// Add a prerequisite; returns `false` if it was already present.
+    pub fn add_prerequisite<O>(&mut self, oid: O) -> bool
+    where
+        O: Into<ObjectId>,
+    {
+        self.prerequisites.insert(oid.into())
+    }
+
+    /// Set the tip of reference `name`; returns the previous tip, if any.
+    pub fn add_reference<O>(&mut self, name: Refname, oid: O) -> Option<ObjectId>
+    where
+        O: Into<ObjectId>,
+    {
+        self.references.insert(name, oid.into())
+    }
+
+    /// SHA-256 over the sorted, de-duplicated set of all prerequisite and
+    /// reference object ids.
+    pub fn hash(&self) -> Hash {
+        let mut ids: BTreeSet<&ObjectId> = BTreeSet::new();
+        ids.extend(self.prerequisites.iter());
+        ids.extend(self.references.values());
+
+        let mut sha = Sha256::new();
+        for id in ids {
+            sha.update(id);
+        }
+        Hash(sha.finalize().into())
+    }
+}
+
+/// The 32 byte value computed by [`Header::hash`], rendered as hex.
+#[derive(Clone, Copy, Eq, Ord, PartialEq, PartialOrd, serde::Serialize, serde::Deserialize)]
+pub struct Hash(#[serde(with = "hex::serde")] [u8; 32]);
+
+impl Hash {
+    /// The raw bytes of the hash.
+    pub fn as_bytes(&self) -> &[u8] {
+        self.deref()
+    }
+
+    /// Whether `hex` parses as a [`Hash`].
+    pub fn is_valid(hex: &str) -> bool {
+        Self::from_str(hex).is_ok()
+    }
+}
+
+impl Deref for Hash {
+    type Target = [u8; 32];
+
+    fn deref(&self) -> &Self::Target {
+        &self.0
+    }
+}
+
+impl fmt::Display for Hash {
+    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
+        f.write_str(&hex::encode(self.0))
+    }
+}
+
+impl fmt::Debug for Hash {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        f.write_str(&hex::encode(self.0))
+    }
+}
+
+impl FromStr for Hash {
+    type Err = FromHexError;
+
+    fn from_str(s: &str) -> Result<Self, Self::Err> {
+        <[u8; 32]>::from_hex(s).map(Self)
+    }
+}
diff --git a/src/bundle/list.rs b/src/bundle/list.rs
new file mode 100644
index 0000000..21753fa
--- /dev/null
+++ b/src/bundle/list.rs
@@ -0,0 +1,335 @@
+// Copyright © 2022 Kim Altintop
+// SPDX-License-Identifier: GPL-2.0-only WITH openvpn-openssl-exception
+
+//! Bundle Lists in git config format, as per [`bundle-uri`].
+//!
+//! [`bundle-uri`]: https://git.kernel.org/pub/scm/git/git.git/tree/Documentation/technical/bundle-uri.txt
+
+use std::{
+    borrow::Cow,
+    cmp::Ordering,
+    collections::HashMap,
+    fmt,
+    io,
+    str::FromStr,
+    time::{
+        SystemTime,
+        UNIX_EPOCH,
+    },
+};
+
+use anyhow::anyhow;
+use once_cell::sync::Lazy;
+use sha2::{
+    Digest,
+    Sha256,
+};
+use url::Url;
+
+use crate::git::{
+    self,
+    if_not_found_none,
+};
+
+pub const FILE_EXTENSION: &str = "uris";
+pub const DOT_FILE_EXTENSION: &str = ".uris";
+
+/// Value of the `bundle.mode` key of a bundle list.
+#[derive(Clone, Copy, Debug)]
+pub enum Mode {
+    All,
+    Any,
+}
+
+impl Mode {
+    /// The config representation: `"all"` or `"any"`.
+    pub fn as_str(&self) -> &str {
+        match self {
+            Self::All => "all",
+            Self::Any => "any",
+        }
+    }
+}
+
+impl fmt::Display for Mode {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        f.write_str(self.as_str())
+    }
+}
+
+impl FromStr for Mode {
+    type Err = crate::Error;
+
+    fn from_str(s: &str) -> Result<Self, Self::Err> {
+        match s {
+            "all" => Ok(Self::All),
+            "any" => Ok(Self::Any),
+            x => Err(anyhow!("unknown bundle list mode: {x}")),
+        }
+    }
+}
+
+/// The uri of a bundle: either a full [`Url`] or a relative path.
+#[derive(Debug)]
+pub enum Uri {
+    Absolute(Url),
+    Relative(String),
+}
+
+impl Uri {
+    pub fn as_str(&self) -> &str {
+        match self {
+            Self::Absolute(url) => url.as_str(),
+            Self::Relative(path) => path.as_str(),
+        }
+    }
+
+    /// The absolute form of this uri: absolute uris are borrowed unchanged,
+    /// relative ones are joined onto `base`.
+    pub fn abs(&self, base: &Url) -> Result<Cow<Url>, url::ParseError> {
+        match self {
+            Self::Absolute(url) => Ok(Cow::Borrowed(url)),
+            Self::Relative(path) => base.join(path).map(Cow::Owned),
+        }
+    }
+}
+
+impl From<Url> for Uri {
+    fn from(url: Url) -> Self {
+        Self::Absolute(url)
+    }
+}
+
+impl FromStr for Uri {
+    type Err = url::ParseError;
+
+    /// Parse as an absolute [`Url`] first. Input which is a relative url is
+    /// resolved against a dummy base, and only the resulting path is kept.
+    fn from_str(s: &str) -> Result<Self, Self::Err> {
+        static DUMMY_BASE: Lazy<Url> =
+            Lazy::new(|| Url::parse("https://bundles.example.com").unwrap());
+
+        Url::parse(s).map(Self::Absolute).or_else(|e| match e {
+            url::ParseError::RelativeUrlWithoutBase => {
+                let url = Url::options().base_url(Some(&DUMMY_BASE)).parse(s)?;
+
+                // Keep the leading slash only if the input had one.
+                let path = if s.starts_with('/') {
+                    url.path()
+                } else {
+                    url.path().trim_start_matches('/')
+                };
+
+                Ok(Self::Relative(path.to_owned()))
+            },
+            other => Err(other),
+        })
+    }
+}
+
+/// One `bundle.<id>.*` entry of a bundle list.
+#[derive(Debug)]
+pub struct Location {
+    pub id: String,
+    pub uri: Uri,
+    pub filter: Option<String>,
+    pub creation_token: Option<u64>,
+    pub location: Option<String>,
+}
+
+impl Location {
+    /// A location with only `id` and `uri` set.
+    pub fn new(id: String, uri: Uri) -> Self {
+        Self {
+            id,
+            uri,
+            filter: None,
+            creation_token: None,
+            location: None,
+        }
+    }
+
+    /// Store this location under `bundle.<id>.*` in `cfg`; unset optional
+    /// fields are not written.
+    pub fn to_config(&self, cfg: &mut git2::Config) -> crate::Result<()> {
+        let section = format!("bundle.{}", self.id);
+
+        cfg.set_str(&format!("{section}.uri"), self.uri.as_str())?;
+        if let Some(filter) = self.filter.as_deref() {
+            cfg.set_str(&format!("{section}.filter"), filter)?;
+        }
+        if let Some(token) = &self.creation_token {
+            cfg.set_str(&format!("{section}.creationToken"), &token.to_string())?;
+        }
+        if let Some(loc) = self.location.as_deref() {
+            cfg.set_str(&format!("{section}.location"), loc)?;
+        }
+
+        Ok(())
+    }
+
+    /// Write this location as a `[bundle "<id>"]` config section.
+    pub fn to_writer<W: io::Write>(&self, mut out: W) -> io::Result<()> {
+        writeln!(&mut out, "[bundle \"{}\"]", self.id)?;
+        writeln!(&mut out, "\turi = {}", self.uri.as_str())?;
+        if let Some(filter) = self.filter.as_deref() {
+            writeln!(&mut out, "\tfilter = {}", filter)?;
+        }
+        if let Some(token) = &self.creation_token {
+            writeln!(&mut out, "\tcreationToken = {}", token)?;
+        }
+        if let Some(loc) = self.location.as_deref() {
+            writeln!(&mut out, "\tlocation = {}", loc)?;
+        }
+
+        Ok(())
+    }
+}
+
+impl From<Url> for Location {
+    /// The id is the hex SHA-256 of the url, the creation token is the
+    /// current time in seconds since the unix epoch.
+    ///
+    /// # Panics
+    ///
+    /// If the system clock is set to before the unix epoch.
+    fn from(url: Url) -> Self {
+        let id = hex::encode(Sha256::digest(url.as_str()));
+        let now = SystemTime::now()
+            .duration_since(UNIX_EPOCH)
+            .expect("backwards system clock")
+            .as_secs();
+        Self {
+            id,
+            uri: url.into(),
+            filter: None,
+            creation_token: Some(now),
+            location: None,
+        }
+    }
+}
+
+/// A bundle list: list-level settings plus the bundle locations.
+#[derive(Debug)]
+pub struct List {
+    pub mode: Mode,
+    pub heuristic: Option<String>,
+    pub bundles: Vec<Location>,
+}
+
+impl List {
+    /// An empty list with mode [`Mode::Any`] and the `creationToken`
+    /// heuristic.
+    pub fn any() -> Self {
+        Self {
+            mode: Mode::Any,
+            heuristic: Some("creationToken".into()),
+            bundles: Vec::new(),
+        }
+    }
+
+    /// Parse a bundle list from a config snapshot.
+    ///
+    /// The config is expected to contain the list config key `bundle.mode`
+    /// and optionally `bundle.heuristic`. `bundle.version` is currently
+    /// ignored.
+    ///
+    /// A bundle [`Location`] is yielded for every `bundle.<id>` section which
+    /// has `bundle.<id>.uri` set. Relative uris are kept as
+    /// [`Uri::Relative`]; use [`Uri::abs`] to resolve them against a base
+    /// [`Url`].
+    ///
+    /// The [`Location`] list is sorted by creation token in descending order
+    /// (entries without a token sort last). The sort is unstable.
+    ///
+    /// # Errors
+    ///
+    /// Fails if `bundle.mode` is missing or unknown, if a value is not valid
+    /// UTF-8, or if a `uri` or `creationToken` value does not parse.
+    pub fn from_config(cfg: git::config::Snapshot) -> crate::Result<Self> {
+        // nb. ignoring version
+        let mode = cfg.get_str("bundle.mode")?.parse()?;
+        let heuristic = if_not_found_none(cfg.get_string("bundle.heuristic"))?;
+
+        // Accumulates the keys seen so far for one `bundle.<id>` section.
+        #[derive(Default)]
+        struct Info {
+            uri: Option<Uri>,
+            filter: Option<String>,
+            creation_token: Option<u64>,
+            location: Option<String>,
+        }
+
+        let mut bundles: HashMap<String, Info> = HashMap::new();
+        let mut iter = cfg.entries(Some("bundle\\.[^.]+\\.[^.]+$"))?;
+        while let Some(entry) = iter.next() {
+            let entry = entry?;
+            // Split the entry name into `bundle`, `<id>` and `<key>`.
+            if let Some(("bundle", id, key)) = entry
+                .name()
+                .and_then(|name| name.split_once('.'))
+                .and_then(|(a, b)| b.split_once('.').map(|(c, d)| (a, c, d)))
+            {
+                let value = entry
+                    .value()
+                    .ok_or_else(|| anyhow!("value for bundle.{id}.{key} not utf8"))?;
+                let info = bundles.entry(id.to_owned()).or_default();
+                match key {
+                    "uri" => {
+                        let uri = value.parse()?;
+                        info.uri = Some(uri);
+                    },
+
+                    "filter" => {
+                        info.filter = Some(value.to_owned());
+                    },
+
+                    "creationToken" | "creationtoken" => {
+                        let token = value.parse()?;
+                        info.creation_token = Some(token);
+                    },
+
+                    "location" => {
+                        info.location = Some(value.to_owned());
+                    },
+
+                    // Unknown keys are ignored.
+                    _ => {},
+                }
+            }
+        }
+        // Sections without a uri are dropped.
+        let mut bundles = bundles
+            .into_iter()
+            .filter_map(|(id, info)| {
+                info.uri.map(|uri| Location {
+                    id,
+                    uri,
+                    filter: info.filter,
+                    creation_token: info.creation_token,
+                    location: info.location,
+                })
+            })
+            .collect::<Vec<_>>();
+        bundles.sort_unstable_by(|a, b| match (&a.creation_token, &b.creation_token) {
+            (Some(x), Some(y)) => y.cmp(x),
+            (Some(_), None) => Ordering::Less,
+            (None, Some(_)) => Ordering::Greater,
+            (None, None) => Ordering::Equal,
+        });
+
+        Ok(Self {
+            mode,
+            heuristic,
+            bundles,
+        })
+    }
+
+    /// Store the list in `cfg`, with `bundle.version` set to 1.
+    pub fn to_config(&self, cfg: &mut git2::Config) -> crate::Result<()> {
+        cfg.set_i32("bundle.version", 1)?;
+        cfg.set_str("bundle.mode", self.mode.as_str())?;
+        if let Some(heuristic) = self.heuristic.as_deref() {
+            cfg.set_str("bundle.heuristic", heuristic)?;
+        }
+        self.bundles.iter().try_for_each(|loc| loc.to_config(cfg))?;
+
+        Ok(())
+    }
+
+    /// Write the list in git config file syntax: a `[bundle]` section
+    /// followed by one section per location.
+    pub fn to_writer<W: io::Write>(&self, mut out: W) -> io::Result<()> {
+        writeln!(&mut out, "[bundle]")?;
+        writeln!(&mut out, "\tversion = 1")?;
+        writeln!(&mut out, "\tmode = {}", self.mode)?;
+        if let Some(heuristic) = self.heuristic.as_deref() {
+            writeln!(&mut out, "\theuristic = {}", heuristic)?;
+        }
+        for loc in &self.bundles {
+            writeln!(&mut out)?;
+            loc.to_writer(&mut out)?;
+        }
+
+        Ok(())
+    }
+
+    /// The output of [`List::to_writer`] as a [`String`].
+    pub fn to_str(&self) -> String {
+        let mut buf = Vec::new();
+        self.to_writer(&mut buf)
+            .expect("writing to a Vec does not fail");
+        // `to_writer` only formats `str` and integer values, so the buffer
+        // is UTF-8; the checked conversion avoids an unjustified `unsafe`.
+        String::from_utf8(buf).expect("bundle list is valid UTF-8")
+    }
+}
+
+impl Extend<Location> for List {
+    fn extend<T>(&mut self, iter: T)
+    where
+        T: IntoIterator<Item = Location>,
+    {
+        self.bundles.extend(iter)
+    }
+}
-- 
cgit v1.2.3