Unverified Commit 7efebca8 authored by Winter's avatar Winter Committed by Lily Foster
Browse files

prefetch-npm-deps: fix reproducibility

v1 lockfiles can contain multiple references to the same version of a
package, and these references can contain different `integrity` values,
such as one having SHA-1 and SHA-512, while another just has SHA-512.

Given that HashMap iteration order isn't defined, this causes
reproducibility issues, as a different integrity value could be chosen
each time.

Thanks to @lilyinstarlight for discovering this issue originally, as well
as the idea for the sorting-based implementation.
parent d6b863fd
Loading
Loading
Loading
Loading
+1 −1
Original line number Diff line number Diff line
@@ -105,7 +105,7 @@ fn main() -> anyhow::Result<()> {
        eprintln!("{}", package.name);

        let tarball = package.tarball()?;
        let integrity = package.integrity();
        let integrity = package.integrity().map(ToString::to_string);

        cache
            .put(
+141 −7
Original line number Diff line number Diff line
use anyhow::{bail, Context};
use anyhow::{anyhow, bail, Context};
use rayon::slice::ParallelSliceMut;
use serde::Deserialize;
use std::{collections::HashMap, fmt};
use serde::{
    de::{self, Visitor},
    Deserialize, Deserializer,
};
use std::{
    cmp::Ordering,
    collections::{HashMap, HashSet},
    fmt,
};
use url::Url;

pub(super) fn packages(content: &str) -> anyhow::Result<Vec<Package>> {
@@ -33,6 +40,13 @@ pub(super) fn packages(content: &str) -> anyhow::Result<Vec<Package>> {
        x.resolved
            .partial_cmp(&y.resolved)
            .expect("resolved should be comparable")
            .then(
                // v1 lockfiles can contain multiple references to the same version of a package, with
                // different integrity values (e.g. a SHA-1 and a SHA-512 in one, but just a SHA-512 in another)
                y.integrity
                    .partial_cmp(&x.integrity)
                    .expect("integrity should be comparable"),
            )
    });

    packages.dedup_by(|x, y| x.resolved == y.resolved);
@@ -54,7 +68,7 @@ struct OldPackage {
    #[serde(default)]
    bundled: bool,
    resolved: Option<UrlOrString>,
    integrity: Option<String>,
    integrity: Option<HashCollection>,
    dependencies: Option<HashMap<String, OldPackage>>,
}

@@ -63,7 +77,7 @@ pub(super) struct Package {
    #[serde(default)]
    pub(super) name: Option<String>,
    pub(super) resolved: Option<UrlOrString>,
    pub(super) integrity: Option<String>,
    pub(super) integrity: Option<HashCollection>,
}

#[derive(Debug, Deserialize, PartialEq, Eq, PartialOrd, Ord)]
@@ -82,6 +96,102 @@ impl fmt::Display for UrlOrString {
    }
}

#[derive(Debug, PartialEq, Eq)]
pub(super) struct HashCollection(HashSet<Hash>);

impl HashCollection {
    pub(super) fn into_best(self) -> Option<Hash> {
        self.0.into_iter().max()
    }
}

impl PartialOrd for HashCollection {
    fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
        let lhs = self.0.iter().max()?;
        let rhs = other.0.iter().max()?;

        lhs.partial_cmp(rhs)
    }
}

impl<'de> Deserialize<'de> for HashCollection {
    fn deserialize<D>(deserializer: D) -> Result<HashCollection, D::Error>
    where
        D: Deserializer<'de>,
    {
        deserializer.deserialize_string(HashCollectionVisitor)
    }
}

struct HashCollectionVisitor;

impl<'de> Visitor<'de> for HashCollectionVisitor {
    type Value = HashCollection;

    fn expecting(&self, formatter: &mut fmt::Formatter) -> fmt::Result {
        formatter.write_str("a single SRI hash or a collection of them (separated by spaces)")
    }

    fn visit_str<E>(self, value: &str) -> Result<HashCollection, E>
    where
        E: de::Error,
    {
        let hashes = value
            .split_ascii_whitespace()
            .map(Hash::new)
            .collect::<anyhow::Result<_>>()
            .map_err(E::custom)?;

        Ok(HashCollection(hashes))
    }
}

#[derive(Debug, Deserialize, PartialEq, Eq, Hash)]
pub struct Hash(String);

// Hash algorithms, in ascending preference.
const ALGOS: &[&str] = &["sha1", "sha512"];

impl Hash {
    fn new(s: impl AsRef<str>) -> anyhow::Result<Hash> {
        let algo = s
            .as_ref()
            .split_once('-')
            .ok_or_else(|| anyhow!("expected SRI hash, got {:?}", s.as_ref()))?
            .0;

        if ALGOS.iter().any(|&a| algo == a) {
            Ok(Hash(s.as_ref().to_string()))
        } else {
            Err(anyhow!("unknown hash algorithm {algo:?}"))
        }
    }
}

impl fmt::Display for Hash {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        self.0.fmt(f)
    }
}

impl PartialOrd for Hash {
    fn partial_cmp(&self, other: &Hash) -> Option<Ordering> {
        let lhs = self.0.split_once('-')?.0;
        let rhs = other.0.split_once('-')?.0;

        ALGOS
            .iter()
            .position(|&s| lhs == s)?
            .partial_cmp(&ALGOS.iter().position(|&s| rhs == s)?)
    }
}

impl Ord for Hash {
    fn cmp(&self, other: &Hash) -> Ordering {
        self.partial_cmp(other).unwrap()
    }
}

#[allow(clippy::case_sensitive_file_extension_comparisons)]
fn to_new_packages(
    old_packages: HashMap<String, OldPackage>,
@@ -149,8 +259,13 @@ fn get_initial_url() -> anyhow::Result<Url> {

#[cfg(test)]
mod tests {
    use super::{get_initial_url, to_new_packages, OldPackage, Package, UrlOrString};
    use std::collections::HashMap;
    use super::{
        get_initial_url, to_new_packages, Hash, HashCollection, OldPackage, Package, UrlOrString,
    };
    use std::{
        cmp::Ordering,
        collections::{HashMap, HashSet},
    };
    use url::Url;

    #[test]
@@ -188,4 +303,23 @@ mod tests {

        Ok(())
    }

    #[test]
    fn hash_preference() {
        assert_eq!(
            Hash(String::from("sha1-foo")).partial_cmp(&Hash(String::from("sha512-foo"))),
            Some(Ordering::Less)
        );

        assert_eq!(
            HashCollection({
                let mut set = HashSet::new();
                set.insert(Hash(String::from("sha512-foo")));
                set.insert(Hash(String::from("sha1-bar")));
                set
            })
            .into_best(),
            Some(Hash(String::from("sha512-foo")))
        );
    }
}
+9 −39
Original line number Diff line number Diff line
@@ -87,7 +87,7 @@ pub struct Package {

#[derive(Debug)]
enum Specifics {
    Registry { integrity: String },
    Registry { integrity: lock::Hash },
    Git { workdir: TempDir },
}

@@ -134,11 +134,11 @@ impl Package {
                Specifics::Git { workdir }
            }
            None => Specifics::Registry {
                integrity: get_ideal_hash(
                    &pkg.integrity
                        .expect("non-git dependencies should have assosciated integrity"),
                )?
                .to_string(),
                integrity: pkg
                    .integrity
                    .expect("non-git dependencies should have assosciated integrity")
                    .into_best()
                    .expect("non-git dependencies should have non-empty assosciated integrity"),
            },
        };

@@ -181,9 +181,9 @@ impl Package {
        }
    }

    pub fn integrity(&self) -> Option<String> {
    pub fn integrity(&self) -> Option<&lock::Hash> {
        match &self.specifics {
            Specifics::Registry { integrity } => Some(integrity.clone()),
            Specifics::Registry { integrity } => Some(integrity),
            Specifics::Git { .. } => None,
        }
    }
@@ -304,25 +304,9 @@ fn get_hosted_git_url(url: &Url) -> anyhow::Result<Option<Url>> {
    }
}

fn get_ideal_hash(integrity: &str) -> anyhow::Result<&str> {
    let split: Vec<_> = integrity.split_ascii_whitespace().collect();

    if split.len() == 1 {
        Ok(split[0])
    } else {
        for hash in ["sha512-", "sha1-"] {
            if let Some(h) = split.iter().find(|s| s.starts_with(hash)) {
                return Ok(h);
            }
        }

        Err(anyhow!("not sure which hash to select out of {split:?}"))
    }
}

#[cfg(test)]
mod tests {
    use super::{get_hosted_git_url, get_ideal_hash};
    use super::get_hosted_git_url;
    use url::Url;

    #[test]
@@ -353,18 +337,4 @@ mod tests {
            "GitLab URLs should be marked as invalid (lol)"
        );
    }

    #[test]
    fn ideal_hashes() {
        for (input, expected) in [
            ("sha512-foo sha1-bar", Some("sha512-foo")),
            ("sha1-bar md5-foo", Some("sha1-bar")),
            ("sha1-bar", Some("sha1-bar")),
            ("sha512-foo", Some("sha512-foo")),
            ("foo-bar sha1-bar", Some("sha1-bar")),
            ("foo-bar baz-foo", None),
        ] {
            assert_eq!(get_ideal_hash(input).ok(), expected);
        }
    }
}