Skip to content

Commit 19f8e8c

Browse files
authored
feat(object_store): parse well-known storage urls (#3327)
* feat(object_store): add url parsing to azure builder * feat(object_store): add url parsing to aws builder * feat(object_store): add url parsing to gcs builder * feat(object_store): parse gcs service account from env * fix: typo * docs(object_store): fix example / template urls * feat(object_store): parse S3 virtually hosted urls * refactor: raise url parsing errors on build * fix: properly set virtual_hosted_style_request in url parsing
1 parent 75ef138 commit 19f8e8c

3 files changed

Lines changed: 300 additions & 3 deletions

File tree

object_store/src/aws/mod.rs

Lines changed: 95 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,7 @@ use std::ops::Range;
4242
use std::sync::Arc;
4343
use tokio::io::AsyncWrite;
4444
use tracing::info;
45+
use url::Url;
4546

4647
use crate::aws::client::{S3Client, S3Config};
4748
use crate::aws::credential::{
@@ -116,6 +117,18 @@ enum Error {
116117

117118
#[snafu(display("Received header containing non-ASCII data"))]
118119
BadHeader { source: reqwest::header::ToStrError },
120+
121+
#[snafu(display("Unable parse source url. Url: {}, Error: {}", url, source))]
122+
UnableToParseUrl {
123+
source: url::ParseError,
124+
url: String,
125+
},
126+
127+
#[snafu(display(
128+
"Unknown url scheme cannot be parsed into storage location: {}",
129+
scheme
130+
))]
131+
UnknownUrlScheme { scheme: String },
119132
}
120133

121134
impl From<Error> for super::Error {
@@ -359,6 +372,7 @@ pub struct AmazonS3Builder {
359372
metadata_endpoint: Option<String>,
360373
profile: Option<String>,
361374
client_options: ClientOptions,
375+
url_parse_error: Option<Error>,
362376
}
363377

364378
impl AmazonS3Builder {
@@ -430,6 +444,67 @@ impl AmazonS3Builder {
430444
builder
431445
}
432446

447+
/// Parse available connection info form a well-known storage URL.
448+
///
449+
/// The supported url schemes are:
450+
///
451+
/// - `s3://<bucket>/<path>`
452+
/// - `s3a://<bucket>/<path>`
453+
/// - `https://s3.<bucket>.amazonaws.com`
454+
/// - `https://<bucket>.s3.<region>.amazonaws.com`
455+
///
456+
/// Please note that this is a best effort implementation, and will not fail for malformed URLs,
457+
/// but rather warn and ignore the passed url. The url also has no effect on how the
458+
/// storage is accessed - e.g. which driver or protocol is used for reading from the location.
459+
///
460+
/// # Example
461+
/// ```
462+
/// use object_store::aws::AmazonS3Builder;
463+
///
464+
/// let s3 = AmazonS3Builder::from_env()
465+
/// .with_url("s3://bucket/path")
466+
/// .build();
467+
/// ```
468+
pub fn with_url(mut self, url: impl AsRef<str>) -> Self {
469+
let maybe_parsed = Url::parse(url.as_ref());
470+
match maybe_parsed {
471+
Ok(parsed) => match parsed.scheme() {
472+
"s3" | "s3a" => {
473+
self.bucket_name = parsed.host_str().map(|host| host.to_owned());
474+
}
475+
"https" => {
476+
if let Some(host) = parsed.host_str() {
477+
let parts = host.splitn(4, '.').collect::<Vec<&str>>();
478+
if parts.len() == 4 && parts[0] == "s3" && parts[2] == "amazonaws"
479+
{
480+
self.bucket_name = Some(parts[1].to_string());
481+
}
482+
if parts.len() == 4
483+
&& parts[1] == "s3"
484+
&& parts[3] == "amazonaws.com"
485+
{
486+
self.bucket_name = Some(parts[0].to_string());
487+
self.region = Some(parts[2].to_string());
488+
self.virtual_hosted_style_request = true;
489+
}
490+
}
491+
}
492+
other => {
493+
self.url_parse_error = Some(Error::UnknownUrlScheme {
494+
scheme: other.into(),
495+
});
496+
}
497+
},
498+
Err(err) => {
499+
self.url_parse_error = Some(Error::UnableToParseUrl {
500+
source: err,
501+
url: url.as_ref().into(),
502+
});
503+
}
504+
};
505+
self
506+
}
507+
433508
/// Set the AWS Access Key (required)
434509
pub fn with_access_key_id(mut self, access_key_id: impl Into<String>) -> Self {
435510
self.access_key_id = Some(access_key_id.into());
@@ -567,6 +642,10 @@ impl AmazonS3Builder {
567642
/// Create a [`AmazonS3`] instance from the provided values,
568643
/// consuming `self`.
569644
pub fn build(self) -> Result<AmazonS3> {
645+
if let Some(err) = self.url_parse_error {
646+
return Err(err.into());
647+
}
648+
570649
let bucket = self.bucket_name.context(MissingBucketNameSnafu)?;
571650
let region = self.region.context(MissingRegionSnafu)?;
572651

@@ -642,8 +721,8 @@ impl AmazonS3Builder {
642721
let endpoint: String;
643722
let bucket_endpoint: String;
644723

645-
//If `endpoint` is provided then its assumed to be consistent with
646-
// `virutal_hosted_style_request`. i.e. if `virtual_hosted_style_request` is true then
724+
// If `endpoint` is provided then its assumed to be consistent with
725+
// `virtual_hosted_style_request`. i.e. if `virtual_hosted_style_request` is true then
647726
// `endpoint` should have bucket name included.
648727
if self.virtual_hosted_style_request {
649728
endpoint = self.endpoint.unwrap_or_else(|| {
@@ -940,4 +1019,18 @@ mod tests {
9401019
err
9411020
);
9421021
}
1022+
1023+
#[test]
1024+
fn s3_test_urls() {
1025+
let builder = AmazonS3Builder::new().with_url("s3://bucket/path");
1026+
assert_eq!(builder.bucket_name, Some("bucket".to_string()));
1027+
1028+
let builder = AmazonS3Builder::new().with_url("https://s3.bucket.amazonaws.com");
1029+
assert_eq!(builder.bucket_name, Some("bucket".to_string()));
1030+
1031+
let builder =
1032+
AmazonS3Builder::new().with_url("https://bucket.s3.region.amazonaws.com");
1033+
assert_eq!(builder.bucket_name, Some("bucket".to_string()));
1034+
assert_eq!(builder.region, Some("region".to_string()))
1035+
}
9431036
}

object_store/src/azure/mod.rs

Lines changed: 110 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -114,6 +114,12 @@ enum Error {
114114

115115
#[snafu(display("Azure credential error: {}", source), context(false))]
116116
Credential { source: credential::Error },
117+
118+
#[snafu(display(
119+
"Unknown url scheme cannot be parsed into storage location: {}",
120+
scheme
121+
))]
122+
UnknownUrlScheme { scheme: String },
117123
}
118124

119125
impl From<Error> for super::Error {
@@ -361,6 +367,7 @@ pub struct MicrosoftAzureBuilder {
361367
use_emulator: bool,
362368
retry_config: RetryConfig,
363369
client_options: ClientOptions,
370+
url_parse_error: Option<Error>,
364371
}
365372

366373
impl Debug for MicrosoftAzureBuilder {
@@ -379,7 +386,7 @@ impl MicrosoftAzureBuilder {
379386
Default::default()
380387
}
381388

382-
/// Create an instance of [MicrosoftAzureBuilder] with values pre-populated from environment variables.
389+
/// Create an instance of [`MicrosoftAzureBuilder`] with values pre-populated from environment variables.
383390
///
384391
/// Variables extracted from environment:
385392
/// * AZURE_STORAGE_ACCOUNT_NAME: storage account name
@@ -424,6 +431,78 @@ impl MicrosoftAzureBuilder {
424431
builder
425432
}
426433

434+
/// Parse available connection info form a well-known storage URL.
435+
///
436+
/// The supported url schemes are:
437+
///
438+
/// - `abfs[s]://<container>/<path>` (according to [fsspec](https://github.com/fsspec/adlfs))
439+
/// - `abfs[s]://<file_system>@<account_name>.dfs.core.windows.net/<path>`
440+
/// - `az://<container>/<path>` (according to [fsspec](https://github.com/fsspec/adlfs))
441+
/// - `adl://<container>/<path>` (according to [fsspec](https://github.com/fsspec/adlfs))
442+
/// - `azure://<container>/<path>` (custom)
443+
/// - `https://<account>.dfs.core.windows.net`
444+
/// - `https://<account>.blob.core.windows.net`
445+
///
446+
/// Please note that this is a best effort implementation, and will not fail for malformed URLs,
447+
/// but rather warn and ignore the passed url. The url also has no effect on how the
448+
/// storage is accessed - e.g. which driver or protocol is used for reading from the location.
449+
///
450+
/// # Example
451+
/// ```
452+
/// use object_store::azure::MicrosoftAzureBuilder;
453+
///
454+
/// let azure = MicrosoftAzureBuilder::from_env()
455+
/// .with_url("abfss://file_system@account.dfs.core.windows.net/")
456+
/// .build();
457+
/// ```
458+
pub fn with_url(mut self, url: impl AsRef<str>) -> Self {
459+
let maybe_parsed = Url::parse(url.as_ref());
460+
match maybe_parsed {
461+
Ok(parsed) => match parsed.scheme() {
462+
"az" | "adl" | "azure" => {
463+
self.container_name = parsed.host_str().map(|host| host.to_owned());
464+
}
465+
"abfs" | "abfss" => {
466+
// abfs(s) might refer to the fsspec convention abfs://<container>/<path>
467+
// or the convention for the hadoop driver abfs[s]://<file_system>@<account_name>.dfs.core.windows.net/<path>
468+
if parsed.username().is_empty() {
469+
self.container_name =
470+
parsed.host_str().map(|host| host.to_owned());
471+
} else if let Some(host) = parsed.host_str() {
472+
let parts = host.splitn(2, '.').collect::<Vec<&str>>();
473+
if parts.len() == 2 && parts[1] == "dfs.core.windows.net" {
474+
self.container_name = Some(parsed.username().to_owned());
475+
self.account_name = Some(parts[0].to_string());
476+
}
477+
}
478+
}
479+
"https" => {
480+
if let Some(host) = parsed.host_str() {
481+
let parts = host.splitn(2, '.').collect::<Vec<&str>>();
482+
if parts.len() == 2
483+
&& (parts[1] == "dfs.core.windows.net"
484+
|| parts[1] == "blob.core.windows.net")
485+
{
486+
self.account_name = Some(parts[0].to_string());
487+
}
488+
}
489+
}
490+
other => {
491+
self.url_parse_error = Some(Error::UnknownUrlScheme {
492+
scheme: other.into(),
493+
});
494+
}
495+
},
496+
Err(err) => {
497+
self.url_parse_error = Some(Error::UnableToParseUrl {
498+
source: err,
499+
url: url.as_ref().into(),
500+
});
501+
}
502+
};
503+
self
504+
}
505+
427506
/// Set the Azure Account (required)
428507
pub fn with_account(mut self, account: impl Into<String>) -> Self {
429508
self.account_name = Some(account.into());
@@ -529,8 +608,13 @@ impl MicrosoftAzureBuilder {
529608
retry_config,
530609
authority_host,
531610
mut client_options,
611+
url_parse_error,
532612
} = self;
533613

614+
if let Some(err) = url_parse_error {
615+
return Err(err.into());
616+
}
617+
534618
let container = container_name.ok_or(Error::MissingContainerName {})?;
535619

536620
let (is_emulator, storage_url, auth, account) = if use_emulator {
@@ -716,4 +800,29 @@ mod tests {
716800
copy_if_not_exists(&integration).await;
717801
stream_get(&integration).await;
718802
}
803+
804+
#[test]
805+
fn azure_blob_test_urls() {
806+
let builder = MicrosoftAzureBuilder::new()
807+
.with_url("abfss://file_system@account.dfs.core.windows.net/");
808+
assert_eq!(builder.account_name, Some("account".to_string()));
809+
assert_eq!(builder.container_name, Some("file_system".to_string()));
810+
811+
let builder = MicrosoftAzureBuilder::new().with_url("abfs://container/path");
812+
assert_eq!(builder.container_name, Some("container".to_string()));
813+
814+
let builder = MicrosoftAzureBuilder::new().with_url("az://container");
815+
assert_eq!(builder.container_name, Some("container".to_string()));
816+
817+
let builder = MicrosoftAzureBuilder::new().with_url("az://container/path");
818+
assert_eq!(builder.container_name, Some("container".to_string()));
819+
820+
let builder = MicrosoftAzureBuilder::new()
821+
.with_url("https://account.dfs.core.windows.net/");
822+
assert_eq!(builder.account_name, Some("account".to_string()));
823+
824+
let builder = MicrosoftAzureBuilder::new()
825+
.with_url("https://account.blob.core.windows.net/");
826+
assert_eq!(builder.account_name, Some("account".to_string()))
827+
}
719828
}

0 commit comments

Comments
 (0)