Someone asked me in Slack, so I figured I’d post some tricks that I use here. I would love to hear the tricks of others! What follows is unofficial, opinionated, and maybe not even best practice.
Aliases to add to .zshrc (use oh-my-zsh, it’s dope)!
# install_pl_dev: install the development version of pytorch-lightning.
# Removes the `typing` package, upgrades pytorch-lightning straight from its
# GitHub repository, then installs `typing` again.
# NOTE(review): presumably `typing` gets in the way of the git install -- confirm.
install_pl_dev() {
  local pl_repo="git+https://github.com/PyTorchLightning/pytorch-lightning.git"

  pip uninstall typing
  pip install -U "$pl_repo"
  pip install typing
}
### pytest
# hft: run everything under ./tests/ with warnings disabled, using
# `-n auto --dist=loadfile` (pytest-xdist options) to run in parallel.
# Arguments: extra pytest options/selectors, forwarded unchanged.
hft() {
  # "$@" keeps each argument intact (e.g. -k "bart and not slow").
  pytest -p no:warnings -n auto --dist=loadfile ./tests/ "$@"
}
# tfork: jump to the transformers fork checkout in the home directory.
# Returns: the exit status of `cd`.
tfork() {
  cd ~/transformers_fork
}
# tmar: run the Marian modeling tests with RUN_SLOW=1 in pytest's environment
# (presumably this enables the slow tests -- confirm against the test suite).
# Arguments: extra pytest options/selectors, forwarded unchanged.
tmar() {
  RUN_SLOW=1 pytest --tb=short -p no:warnings ./tests/test_modeling_marian.py -ra "$@"
}
# tmar_tok: run the Marian tokenization tests with RUN_SLOW=1 in pytest's
# environment (presumably this enables the slow tests -- confirm).
# Arguments: extra pytest options/selectors, forwarded unchanged.
tmar_tok() {
  RUN_SLOW=1 pytest --tb=short -p no:warnings ./tests/test_tokenization_marian.py -ra "$@"
}
# tbart: run the BART modeling tests with short tracebacks.
# Arguments: extra pytest options/selectors, forwarded unchanged.
tbart() {
  # Variant without --tb=short, kept for reference:
  #pytest -p no:warnings ./tests/test_modeling_bart.py -ra "$@"
  pytest --tb=short -p no:warnings ./tests/test_modeling_bart.py -ra "$@"
}
# ttf: run the TensorFlow BART modeling tests.
# Arguments: extra pytest options/selectors, forwarded unchanged.
ttf() {
  pytest -p no:warnings ./tests/test_modeling_tf_bart.py -ra "$@"
}
# tbm: run the BART modeling tests, then re-run the tests matching `-k mnli`
# with RUN_SLOW=1 in pytest's environment.
# Arguments: extra pytest options, forwarded unchanged to the FIRST run only;
#            the mnli run takes no extra arguments.
tbm() {
  pytest -p no:warnings ./tests/test_modeling_bart.py -ra "$@"
  RUN_SLOW=1 pytest -p no:warnings tests/test_modeling_bart.py -sv -k mnli
}
# tcnn: run the BART tests matching `-k cnn` with RUN_SLOW=1 in pytest's
# environment, then run the whole BART modeling test file.
# Arguments: extra pytest options, forwarded unchanged to BOTH runs.
tcnn() {
  RUN_SLOW=1 pytest --tb=short -p no:warnings tests/test_modeling_bart.py -sv -k cnn "$@"
  pytest -p no:warnings ./tests/test_modeling_bart.py -ra "$@"
}
# txsum: run the BART tests matching `-k xsum` with RUN_SLOW=1 in pytest's
# environment.
# Arguments: extra pytest options, forwarded unchanged.
txsum() {
  RUN_SLOW=1 pytest --tb=short -p no:warnings tests/test_modeling_bart.py -sv -k xsum "$@"
  # Disabled follow-up run of the whole file:
  # pytest -p no:warnings ./tests/test_modeling_bart.py -ra "$@"
}
# tmbart: run the BART tests matching `-k mbart` with RUN_SLOW=1 in pytest's
# environment.
# Arguments: extra pytest options, forwarded unchanged.
tmbart() {
  RUN_SLOW=1 pytest --tb=short -p no:warnings tests/test_modeling_bart.py -sv -k mbart "$@"
  # Disabled follow-up run of the whole file:
  # pytest -p no:warnings ./tests/test_modeling_bart.py -ra "$@"
}
# tenro: run the BART tests matching `-k enro` with RUN_SLOW=1 in pytest's
# environment (uses -s only, unlike the sibling helpers' -sv).
# Arguments: extra pytest options, forwarded unchanged.
tenro() {
  RUN_SLOW=1 pytest --tb=short -p no:warnings tests/test_modeling_bart.py -s -k enro "$@"
  # Disabled follow-up run of the whole file:
  # pytest -p no:warnings ./tests/test_modeling_bart.py -ra "$@"
}
# misc
# _checkout_grep: check out a git revision and count the lines matching a pattern.
# Arguments: $1 - revision (tag/branch) to check out
#            $2 - pattern passed to `git grep`
# Outputs:   the number of matching lines (from `wc -l`) on stdout
# Returns:   1 if the checkout fails; otherwise the pipeline's status
_checkout_grep() {
  # Output is discarded to suppress the "Previous HEAD position" message, so a
  # failed checkout must be detected via the exit status; otherwise the count
  # would silently come from whatever revision was checked out before.
  if ! git checkout "$1" > /dev/null 2>&1; then
    printf '_checkout_grep: could not check out %s\n' "$1" >&2
    return 1
  fi
  git grep "$2" | wc -l
}
# check_torch_compat: check the pytorch compatibility of a function by counting
# its mentions in the docs of each pytorch release.
# Example usage: check_torch_compat torch.bool
# Arguments: $1 - pattern to search for
# Outputs:   for v1.0.0 .. v1.4.0 and master, a label line ("1.0", ..., "master")
#            followed by the output of _checkout_grep for that revision
# Returns:   non-zero without touching any repository if ~/pytorch/docs cannot
#            be entered; otherwise the status of the final `cd -`
# Side effect: the ~/pytorch checkout is left on master.
check_torch_compat() {
  # Bail out if the cd fails: continuing would run `git checkout` in whatever
  # repository the caller happens to be in.
  cd ~/pytorch/docs || return

  local ver
  for ver in 1.0 1.1 1.2 1.3 1.4; do
    echo "$ver"
    _checkout_grep "v${ver}.0" "$1"
  done
  echo "master"
  _checkout_grep master "$1"

  # Return to the caller's directory; silence the path that `cd -` prints.
  cd - > /dev/null 2>&1
}
# texamples: run the tests under examples/ with short tracebacks.
# Arguments: extra pytest options/selectors (e.g. -k finetune), forwarded unchanged.
texamples() {
  pytest --tb=short -p no:warnings examples/ "$@"
}
# sty: format and lint the repository.
# Runs `make style`, then flake8 over the main source directories with a fixed
# set of ignored checks. flake8 runs regardless of the `make` result.
# Returns: the exit status of flake8.
sty() {
  make style
  flake8 --ignore=P,E501,E203,W503,E741 \
    examples templates tests src utils
}
# gsync: bring the current branch up to date with upstream/master.
# Fetches the `upstream` remote and, only if that succeeds, merges
# upstream/master into the current branch.
# Calls `git` directly: the `g` shorthand is not defined in this file
# (presumably it is the oh-my-zsh git plugin's alias), so relying on it breaks
# the function wherever that alias is missing or defined after this function.
# Returns: the status of the first git command that fails, else 0.
gsync() {
  git fetch upstream && git merge upstream/master
}
# covg: open a coverage page with the `open` command.
# Arguments: $1 - path appended verbatim to $COVERAGE_URL (may be empty)
# Globals:   COVERAGE_URL (read)
covg() {
  open "${COVERAGE_URL}${1}"
}
# GCP STUFF
# Default GCP project; read by the gcloud helper functions via $CUR_PROJ.
# Replace the placeholder with a real project id.
export CUR_PROJ="YOUR GCP PROJECT"
# Runs every time this file is sourced. Quoted so a value containing spaces
# (like the placeholder) is passed to gcloud as a single argument.
gcloud config set project "$CUR_PROJ"
# start_gpu: start the currently selected compute instance.
# Globals: CUR_INSTANCE_NAME, CUR_PROJ, ZONE (read) -- the instance name and
#          zone are set by the workon_* helpers.
start_gpu() {
  gcloud compute instances start "$CUR_INSTANCE_NAME" --project "$CUR_PROJ" --zone "$ZONE"
}
# stop_gpu: stop the currently selected compute instance.
# Globals: CUR_INSTANCE_NAME, CUR_PROJ, ZONE (read) -- the instance name and
#          zone are set by the workon_* helpers.
stop_gpu() {
  gcloud compute instances stop "$CUR_INSTANCE_NAME" --project "$CUR_PROJ" --zone "$ZONE"
}
# Placeholder GCP project name; replace the FIXME value before use.
# NOTE(review): none of the helpers visible in this file read HF_PROJ (they
# use CUR_PROJ) -- confirm whether it is still needed.
HF_PROJ="FIXME your gcp project name"
export HF_PROJ
# hfg_ssh: ssh into the currently selected instance, passing
# `-L 5555:localhost:8888` through to ssh (forwards local port 5555 to port
# 8888 on the instance).
# Globals: CUR_INSTANCE_NAME, CUR_PROJ, ZONE (read)
hfg_ssh() {
  gcloud beta compute ssh --zone "$ZONE" "$CUR_INSTANCE_NAME" --project "$CUR_PROJ" -- -L 5555:localhost:8888
}
# tidy_ssh: ssh into the currently selected instance with no extra ssh options
# (no port forwarding).
# Globals: CUR_INSTANCE_NAME, CUR_PROJ, ZONE (read)
tidy_ssh() {
  gcloud beta compute ssh --zone "$ZONE" "$CUR_INSTANCE_NAME" --project "$CUR_PROJ"
}
# put_my_s3: recursively upload a local path to the sshleifer S3 prefix.
# Arguments: $1  - local file or directory to upload (required)
#            $2+ - extra s3cmd options, forwarded unchanged
# Returns:   1 with a usage message if no path is given, else s3cmd's status
put_my_s3() {
  if [ "$#" -eq 0 ]; then
    printf 'usage: put_my_s3 <local-path> [s3cmd options...]\n' >&2
    return 1
  fi

  local src=$1
  # Drop the source from "$@" so it is not passed a second time after the
  # destination URI.
  shift
  s3cmd put --recursive "$src" s3://models.huggingface.co/bert/sshleifer/ "$@"
}
# Workon different machines then run hfg_ssh
# workon_hfg: select the "shleifer-MYSICKGPU" instance in us-central1-a.
# Globals: CUR_INSTANCE_NAME, ZONE (set and exported)
workon_hfg() {
  CUR_INSTANCE_NAME="shleifer-MYSICKGPU"
  ZONE='us-central1-a'
  export CUR_INSTANCE_NAME ZONE
}
# workon_pegasus: select the "notreally-pegasus-vm" instance in us-west1-b.
# Globals: CUR_INSTANCE_NAME, ZONE (set and exported)
workon_pegasus() {
  CUR_INSTANCE_NAME="notreally-pegasus-vm"
  ZONE="us-west1-b"
  export CUR_INSTANCE_NAME ZONE
}
# workon_tpu: select the "shleifer-HUGETPUCLUSTERFORMAKEAGI" instance in
# us-central1-f.
# Globals: CUR_INSTANCE_NAME, ZONE (set and exported)
workon_tpu() {
  CUR_INSTANCE_NAME="shleifer-HUGETPUCLUSTERFORMAKEAGI"
  ZONE="us-central1-f"
  export CUR_INSTANCE_NAME ZONE
}
# workon_v8: select the "shleifer-BLAH" instance in us-central1-a.
# Globals: CUR_INSTANCE_NAME, ZONE (set and exported)
workon_v8() {
  CUR_INSTANCE_NAME="shleifer-BLAH"
  ZONE='us-central1-a'
  export CUR_INSTANCE_NAME ZONE
}
# start_v8: select the v8 instance (workon_v8), then start it (start_gpu).
# Returns: the exit status of start_gpu.
start_v8() {
  workon_v8; start_gpu
}
# Environment defaults for Python tooling.
# TOKENIZERS_PARALLELISM=false: presumably turns off parallelism in the
#   HuggingFace tokenizers library -- confirm against its docs.
# PYTEST_ADDOPTS: extra options pytest picks up on every run; here --pdbcls
#   names IPython's Pdb class as the debugger.
TOKENIZERS_PARALLELISM=false
PYTEST_ADDOPTS='--pdbcls=IPython.terminal.debugger:Pdb'
export TOKENIZERS_PARALLELISM PYTEST_ADDOPTS
### AWS/Seq2Seq Stuff
# Base URL that covg appends a file path to.
export COVERAGE_URL="https://codecov.io/gh/huggingface/transformers/src/master/src/transformers/"

# Short handles for frequently used S3 prefixes.
export h="s3://models.huggingface.co/bert/Helsinki-NLP"
export b="s3://models.huggingface.co/bert"
export ss="s3://models.huggingface.co/bert/sshleifer"
export sdbart="s3://sshleifer.logs/dbart"

# seq2seq examples directory inside the fork (value keeps its trailing slash).
export sdir="$HOME/transformers_fork/examples/seq2seq/"
# Dataset directories under $sdir. ${sdir%/} strips the trailing slash so the
# resulting paths do not contain a doubled "//".
export CNN_DIR="${sdir%/}/dbart/cnn_dm"
export XSUM_DIR="${sdir%/}/dbart/xsum"
export ENRO_DIR="${sdir%/}/dbart/wmt_en_ro"

# XSUM raw data: original download URL and an S3 copy.
export XSUM_URL="http://bollin.inf.ed.ac.uk/public/direct/XSUM-EMNLP18-Summary-Data-Original.tar.gz"
export XSUM_RAW_S3="s3://sshleifer.logs/dbart/XSUM-EMNLP18-Summary-Data-Original.tar.gz"
# aw3: shorthand for `aws s3`.
# Arguments: the s3 subcommand and its arguments, forwarded unchanged.
aw3() {
  aws s3 "$@"
}
# s3ls: shorthand for `aws s3 ls`.
# Arguments: S3 URIs and options for `ls`, forwarded unchanged.
s3ls() {
  aws s3 ls "$@"
}
Misc tips:
- Fork the repo, call that directory transformers_fork, and clone it to $HOME/ on every machine.
- Use pip install -e ".[dev]" to keep up to date with dependency changes (isort, tokenizers mostly).
- Every time I start a VM, I put my dotfiles up there, either with scp or git. I use git for dotfiles and scp for ~/.ssh/.
- When I want to update a branch, I usually run:
git checkout master
gsync # fetch upstream, merge upstream/master
git checkout <branch>
git merge master
If there are merge conflicts, I fix them in my IDE (vscode is nice, or pycharm cmd-k). I don’t trust git very much with this. The more you run this, the simpler it is to resolve merge conflicts.
Test Driven Development
(my version)
- I run texamples -k finetune a lot and try to keep it always green if I am working on examples/seq2seq, mostly on my Mac but also on my VM.
- I also run sty, my make style, isort, flake8 alias, all the time.
- When I am updating bart or adding a new model, I write the tests first and then try to get them green one by one. Same with a new feature: test first, then add the feature. This often includes adding a new check to an existing test.
- I set tons of ipdb breakpoints for debugging hard things.