Use refgenie in your pipeline

The code snippets below can be used in your pipeline to assert the existence of the refgenie-managed files in 3 different languages: Bash, Python and R.

Refgenie checks whether the asset is available locally and, if it is not, tries to pull it from the server.

The only step that needs to precede the execution of these functions is refgenie genome configuration file initialization:

export REFGENIE=refgenie_config.yaml
refgenie init -c $REFGENIE

Bash

Requirements:

  • Python package refgenie
#!/bin/bash

# Assert that a refgenie-managed asset is available locally, pulling it first
# when it is not.
#
# Arguments:
#   $1  asset registry path, e.g. hg38/fasta
# Environment:
#   REFGENIE  path to the refgenie genome configuration file; must be set
# Outputs:
#   Colored status messages on stdout. On success the resolved path is left in
#   the global variable file_path so the caller can use it.
# Exit status:
#   Calls `exit 1` (terminating the calling script) when the registry path is
#   missing, REFGENIE is unset, or the asset cannot be found after pulling.
assert_refgenie_asset_exists(){

    # keep the color codes out of the caller's namespace
    local RED='\033[0;31m'
    local GREEN='\033[0;32m'
    local YELLOW='\033[0;33m'
    local NC='\033[0m'

    if [ -z "$1" ]; then
        echo -e "\n${RED}Asset registry path not provided!${NC}\n"
        exit 1
    fi

    # check if refgenie env var is defined
    if [ -z "$REFGENIE" ]
    then
        echo -e "${RED}refgenie env var not defined."
        echo -e "Run 'export REFGENIE=<path to refgenie config>' to set the env var.${NC}"
        exit 1
    else
        echo -e "${GREEN}refgenie env var defined: $REFGENIE${NC}"
    fi

    # check if asset is available locally
    # ("$1" is quoted so the registry path is never word-split or glob-expanded;
    # file_path is intentionally NOT local, see header)
    if file_path=$(refgenie seek "$1"); then
        echo -e "${GREEN}Found ($1) asset: $file_path${NC}"
    else
        # pull if not available locally
        echo -e "${YELLOW}Asset ($1) not found, pulling...${NC}"
        refgenie pull "$1"
        # the second seek is the actual success check for the pull
        if file_path=$(refgenie seek "$1"); then
            echo -e "${GREEN}Asset ($1) pulled successfully: $file_path${NC}"
        else
            echo -e "${RED}Asset ($1) pull failed${NC}"
            exit 1
        fi
    fi
}


# Run like this: assert_refgenie_asset_exists hg38/fasta

Python

Requirements:

  • Python package refgenconf
from refgenconf import RefGenConf

def assert_refgenie_asset_exists(
    genome, asset, tag=None, seek_key=None, refgenie_config=None
):
    """
    Make sure a refgenie asset is available locally and return its path.

    The asset is looked up in the local refgenie configuration; when the
    genome or the asset is not listed there, it is pulled first.

    :param str genome: genome name, e.g. "hg38"
    :param str asset: asset name, e.g. "fasta"
    :param str tag: asset tag; the configuration's default tag for the
        genome/asset pair is used when not provided
    :param str seek_key: seek key to resolve within the asset, passed to
        ``RefGenConf.seek`` unchanged
    :param str refgenie_config: path to the refgenie genome configuration
        file; read from the ``REFGENIE`` environment variable when not provided
    :return: whatever ``RefGenConf.seek`` returns for the asset (the local path)
    :raises Exception: any error raised by the pull is re-raised after a
        "Pull failed" notice is printed
    """
    import os

    # determine refgenie config path, provided vs. read from env
    if refgenie_config is None:
        refgenie_config = os.environ.get("REFGENIE")

    # instantiate RefGenConf object
    rgc = RefGenConf(filepath=refgenie_config)

    # get tag of interest, provided vs. default
    if tag is None:
        tag = rgc.get_default_tag(genome=genome, asset=asset)

    # list assets available locally
    list_result = rgc.list()

    # check whether the asset of interest is missing
    if genome not in list_result or asset not in list_result[genome]:
        # pull asset if missing
        print(f"{genome}/{asset}:{tag} not found, pulling...")
        try:
            rgc.pull(genome=genome, asset=asset, tag=tag)
        except Exception:
            print("Pull failed")
            raise

    # get the local path to the asset of interest and hand it to the caller;
    # note that seek names its parameters genome_name/asset_name/tag_name
    return rgc.seek(
        genome_name=genome, asset_name=asset, tag_name=tag, seek_key=seek_key
    )


# Run like this: assert_refgenie_asset_exists(
#     genome="hg38",
#     asset="fasta",
# )

R

Requirements:

  • Python package refgenconf
  • R package reticulate
library('reticulate')

#' Assert that a refgenie asset exists locally, pulling it when missing
#'
#' Uses the Python package `refgenconf` through `reticulate`.
#'
#' @param genome Genome name, e.g. `"hg38"`.
#' @param asset Asset name, e.g. `"fasta"`.
#' @param tag Asset tag. When `NULL`, the default tag reported by the
#'   refgenie configuration for this genome/asset pair is used.
#' @param seek_key Seek key, passed unchanged to `RefGenConf$seek()`.
#' @param refgenieConfig Path to the refgenie genome configuration file. When
#'   `NULL`, the `REFGENIE` environment variable is used.
#' @return The result of `RefGenConf$seek()` for the asset (the local path),
#'   converted to an R object.
assertRefgenieAssetExists <-
  function(genome,
           asset,
           tag = NULL,
           seek_key = NULL,
           refgenieConfig = NULL) {

    # import Python module; convert = FALSE keeps results as Python objects
    # until they are converted explicitly below
    refgenconf <- reticulate::import("refgenconf", convert = FALSE)

    # determine refgenie config path, provided vs. read from env
    if (is.null(refgenieConfig)) {
      refgenieConfig <- Sys.getenv("REFGENIE")
    }
    # Sys.getenv() yields "" for an unset variable; fail with a clear message
    # instead of handing an empty path to Python
    if (identical(refgenieConfig, "")) {
      stop(
        "refgenie config not provided and the REFGENIE env var is not set",
        call. = FALSE
      )
    }

    # instantiate Python RefGenConf object
    rgc <- refgenconf$RefGenConf(filepath = refgenieConfig)

    # get tag of interest, provided vs. default
    if (is.null(tag)) {
      tag <- reticulate::py_to_r(
        rgc$get_default_tag(genome = genome, asset = asset)
      )
    }

    # string together the final asset registry path, for logging
    assetRegistryPath <- paste0(genome, "/", asset, ":", tag)

    # list assets available locally
    listResult <- reticulate::py_to_r(rgc$list())

    # check whether the asset of interest is missing; a genome that is not
    # listed yields NULL here, for which %in% is FALSE
    localAssets <- unlist(listResult[[genome]])
    if (!(asset %in% localAssets)) {
      # pull asset if missing
      message(paste0(assetRegistryPath, " not found, pulling..."))
      # NOTE(review): force/force_large presumably suppress interactive
      # confirmation prompts -- confirm against the refgenconf documentation
      rgc$pull(
        genome = genome,
        asset = asset,
        tag = tag,
        force = TRUE,
        force_large = TRUE
      )
    }

    # get the local path to the asset of interest and return it to the caller
    seekResult <- rgc$seek(
      genome_name = genome,
      asset_name = asset,
      tag_name = tag,
      seek_key = seek_key
    )
    reticulate::py_to_r(seekResult)
  }


# Run like this: assertRefgenieAssetExists(
#     genome = "hg38",
#     asset = "fasta"
# )