Turn on linting for functorch (#81987)

Test Plan:
- wait for CI
Pull Request resolved: https://github.com/pytorch/pytorch/pull/81987
Approved by: https://github.com/samdow
diff --git a/.flake8 b/.flake8
index 9ef3b95..75abc4d 100644
--- a/.flake8
+++ b/.flake8
@@ -22,8 +22,9 @@
     ./docs/caffe2,
     ./docs/cpp/src,
     ./docs/src,
-    # See NOTE: [Impending functorch move]
-    ./functorch,
+    ./functorch/docs,
+    ./functorch/examples,
+    ./functorch/notebooks,
     ./scripts,
     ./test/generated_type_hints_smoketest.py,
     ./third_party,
diff --git a/.lintrunner.toml b/.lintrunner.toml
index fa5bf45..0fdd362 100644
--- a/.lintrunner.toml
+++ b/.lintrunner.toml
@@ -9,11 +9,9 @@
     'docs/caffe2/**',
     'docs/cpp/src/**',
     'docs/src/**',
-    # NOTE: [Impending functorch move]
-    # In preparation for the functorch -> pytorch merge,
-    # we are adding the following excludes so that functorch passes
-    # lint when it gets merged in. Please don't delete.
-    'functorch/**',
+    'functorch/docs/**',
+    'functorch/examples/**',
+    'functorch/notebooks/**',
     'scripts/**',
     'test/generated_type_hints_smoketest.py',
     'third_party/**',
@@ -227,8 +225,6 @@
 include_patterns = ['**/*.py', '**/*.pyi']
 exclude_patterns = [
     'test/test_jit.py',
-    # See NOTE: [Impending functorch move]
-    'functorch/**',
 ]
 command = [
     'python3',
@@ -301,8 +297,6 @@
     'tools/clang_format_hash/**',
     'test/cpp/jit/upgrader_models/*.ptl',
     'test/cpp/jit/upgrader_models/*.ptl.ff',
-    # See NOTE: [Impending functorch move]
-    'functorch/**',
 ]
 command = [
     'python3',
@@ -322,8 +316,6 @@
     'aten/src/ATen/native/vulkan/api/vk_mem_alloc.h',
     'test/cpp/jit/upgrader_models/*.ptl',
     'test/cpp/jit/upgrader_models/*.ptl.ff',
-    # See NOTE: [Impending functorch move]
-    'functorch/**',
 ]
 command = [
     'python3',
@@ -353,8 +345,6 @@
     'test/cpp/jit/upgrader_models/*.ptl',
     'test/cpp/jit/upgrader_models/*.ptl.ff',
     '.lintrunner.toml',
-    # See NOTE: [Impending functorch move]
-    'functorch/**',
 ]
 command = [
     'python3',
@@ -436,8 +426,6 @@
     '**/git-pre-commit',
     '**/git-clang-format',
     '**/gradlew',
-    # See NOTE: [Impending functorch move]
-    'functorch/**',
 ]
 command = [
     'python3',
diff --git a/functorch/.circleci/unittest/windows/scripts/set_cuda_envs.sh b/functorch/.circleci/unittest/windows/scripts/set_cuda_envs.sh
index d1ed415..7db3137 100644
--- a/functorch/.circleci/unittest/windows/scripts/set_cuda_envs.sh
+++ b/functorch/.circleci/unittest/windows/scripts/set_cuda_envs.sh
@@ -4,7 +4,7 @@
 echo CU_VERSION is "${CU_VERSION}"
 echo CUDA_VERSION is "${CUDA_VERSION}"
 
-# Currenly, CU_VERSION and CUDA_VERSION are not consistent. 
+# Currently, CU_VERSION and CUDA_VERSION are not consistent.
 # to understand this code, see https://github.com/pytorch/vision/issues/4443
 version="cpu"
 if [[ ! -z "${CUDA_VERSION}" ]] ; then
diff --git a/functorch/benchmarks/chrome_trace_parser.py b/functorch/benchmarks/chrome_trace_parser.py
index f07a159..54d2bf1 100755
--- a/functorch/benchmarks/chrome_trace_parser.py
+++ b/functorch/benchmarks/chrome_trace_parser.py
@@ -21,7 +21,7 @@
     return modelname
 
 def get_total_length(run_times_df, modelname):
-    return float(run_times_df[run_times_df["name"]==modelname]["runtime"])
+    return float(run_times_df[run_times_df["name"] == modelname]["runtime"])
 
 
 def main():
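
For context on the expression this hunk touches: `float()` on a filtered Series only works when exactly one row matches, and newer pandas releases warn that `float()` on a length-1 Series is deprecated. A minimal sketch (with a made-up DataFrame) of a more explicit way to extract the scalar:

```python
import pandas as pd

# Made-up stand-in for the run_times_df read from args.runtime.
run_times_df = pd.DataFrame({"name": ["resnet50", "bert"],
                             "runtime": [1.5, 2.0]})

def get_total_length(run_times_df, modelname):
    # Filter, then take the first (and assumed only) match explicitly;
    # float(matches.iloc[0]) avoids the deprecated float(Series) path.
    matches = run_times_df.loc[run_times_df["name"] == modelname, "runtime"]
    return float(matches.iloc[0])

print(get_total_length(run_times_df, "resnet50"))  # 1.5
```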
@@ -51,16 +51,16 @@
     else:
         print("Please provide a filename or a folder name")
 
-    print(f"modelname, GPU Utilization, MM and Conv time")
+    print("modelname, GPU Utilization, MM and Conv time")
 
-    run_times_df = pd.read_csv(args.runtime)   
+    run_times_df = pd.read_csv(args.runtime)
     for filename in filenames:
         try:
             modelname = get_model_name(filename)
             total_length = get_total_length(run_times_df, modelname) * 1e6
             utilization, mm_conv_utilization = compute_utilization(filenames, total_length)
             print(f"{modelname}, {utilization}, {mm_conv_utilization}")
-        except:
+        except BaseException:
             logging.exception(f"{filename}, ERROR")
             print(f"{filename}, ERROR")
 
diff --git a/functorch/benchmarks/per_sample_grads.py b/functorch/benchmarks/per_sample_grads.py
index c5d911b..e9e3524 100644
--- a/functorch/benchmarks/per_sample_grads.py
+++ b/functorch/benchmarks/per_sample_grads.py
@@ -2,11 +2,8 @@
 import torch.nn as nn
 import torchvision.models as models
 from opacus.utils.module_modification import convert_batchnorm_modules
-from torchvision.datasets import CIFAR10
 import time
 
-from functools import partial
-import functorch
 from functorch import vmap, grad
 from functorch import make_functional
 from opacus import PrivacyEngine
diff --git a/functorch/benchmarks/process_scorecard.py b/functorch/benchmarks/process_scorecard.py
index 7c17e80..f95d879 100644
--- a/functorch/benchmarks/process_scorecard.py
+++ b/functorch/benchmarks/process_scorecard.py
@@ -8,7 +8,7 @@
 pivot_op_shape = df.pivot_table(values="time", index=["operator", "shape"], columns=["fuser"])
 pivot_speedups = (pivot_op_shape.T / pivot_op_shape["eager"]).T
 
-plt.rcParams["figure.figsize"] = (20,100)
+plt.rcParams["figure.figsize"] = (20, 100)
 fig, axs = plt.subplots(nops)
 plt.subplots_adjust(hspace=0.5)
 for idx, op in enumerate(ops):
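
The hunk above is only a spacing fix, but the surrounding speedup computation deserves a note: dividing a DataFrame by a Series aligns the Series index with the frame's columns, so the double transpose is what lets every (operator, shape) row be normalized by its `eager` time. An illustrative sketch with made-up timings:

```python
import pandas as pd

# Made-up timings: rows are (operator, shape) pairs, columns are fusers.
idx = pd.MultiIndex.from_tuples([("add", "1x1"), ("mul", "2x2")],
                                names=["operator", "shape"])
pivot_op_shape = pd.DataFrame({"eager": [10.0, 8.0], "nvfuser": [5.0, 4.0]},
                              index=idx)

# df / series divides column-wise by matching the series index against
# the frame's columns; transposing first makes that match the row index.
pivot_speedups = (pivot_op_shape.T / pivot_op_shape["eager"]).T
print(pivot_speedups)  # eager normalizes to 1.0, nvfuser to 0.5
```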
diff --git a/functorch/benchmarks/transformer_fusion_patterns/benchmark.py b/functorch/benchmarks/transformer_fusion_patterns/benchmark.py
index 1ffb9bc..a6646e1 100644
--- a/functorch/benchmarks/transformer_fusion_patterns/benchmark.py
+++ b/functorch/benchmarks/transformer_fusion_patterns/benchmark.py
@@ -1,5 +1,4 @@
 import torch
-import time
 from functorch.compile import memory_efficient_fusion, clear_compile_cache
 import benchmark_helper
 
diff --git a/functorch/benchmarks/transformer_fusion_patterns/benchmark_helper.py b/functorch/benchmarks/transformer_fusion_patterns/benchmark_helper.py
index 874b196..bad2757 100644
--- a/functorch/benchmarks/transformer_fusion_patterns/benchmark_helper.py
+++ b/functorch/benchmarks/transformer_fusion_patterns/benchmark_helper.py
@@ -63,7 +63,9 @@
     print("################################################\n\n\n\n")
 
 
-def time_with_torch_timer(fn, args, string_id, kwargs={}):
+def time_with_torch_timer(fn, args, string_id, kwargs=None):
+    if kwargs is None:
+        kwargs = {}
     print("################################################")
     print(f"#### Torch Timer for {string_id} starts #########")
     print("################################################")
diff --git a/functorch/benchmarks/transformer_fusion_patterns/bias_gelu_dropout.py b/functorch/benchmarks/transformer_fusion_patterns/bias_gelu_dropout.py
index 8126ec4..b231806 100644
--- a/functorch/benchmarks/transformer_fusion_patterns/bias_gelu_dropout.py
+++ b/functorch/benchmarks/transformer_fusion_patterns/bias_gelu_dropout.py
@@ -1,9 +1,8 @@
 import torch
-import time
 from functorch.compile import memory_efficient_pointwise_fusion, clear_compile_cache
 import benchmark_helper
 
-### ALL comments regarding the patetrns
+# ALL comments regarding the patterns
 
 
 def bias_gelu_dropout(input, bias):
diff --git a/functorch/docs/source/_static/images/functorch.svg b/functorch/docs/source/_static/images/functorch.svg
index b1ac5cf..ec7d794 100644
--- a/functorch/docs/source/_static/images/functorch.svg
+++ b/functorch/docs/source/_static/images/functorch.svg
@@ -3,4 +3,4 @@
   font-family: Arial Black;
   dominant-baseline: central;
   text-anchor: middle;
-}</style></svg>
\ No newline at end of file
+}</style></svg>
diff --git a/functorch/docs/source/aot_autograd.rst b/functorch/docs/source/aot_autograd.rst
index da3da57..5123a35 100644
--- a/functorch/docs/source/aot_autograd.rst
+++ b/functorch/docs/source/aot_autograd.rst
@@ -40,4 +40,4 @@
     :nosignatures:
 
     nop
-    ts_compile
\ No newline at end of file
+    ts_compile
diff --git a/functorch/examples/dp_cifar10/cifar10_opacus.py b/functorch/examples/dp_cifar10/cifar10_opacus.py
index b16f2e9..bcd0aae 100644
--- a/functorch/examples/dp_cifar10/cifar10_opacus.py
+++ b/functorch/examples/dp_cifar10/cifar10_opacus.py
@@ -465,4 +465,4 @@
 
 
 if __name__ == "__main__":
-    main()
\ No newline at end of file
+    main()
diff --git a/functorch/examples/maml_omniglot/.gitignore b/functorch/examples/maml_omniglot/.gitignore
index 1a2aff2..783c4e5 100644
--- a/functorch/examples/maml_omniglot/.gitignore
+++ b/functorch/examples/maml_omniglot/.gitignore
@@ -1,3 +1,2 @@
 omniglot/
 maml-accs.png
-
diff --git a/functorch/functorch/_src/vmap.py b/functorch/functorch/_src/vmap.py
index 9855335..1504107 100644
--- a/functorch/functorch/_src/vmap.py
+++ b/functorch/functorch/_src/vmap.py
@@ -121,7 +121,7 @@
         flat_in_dims: List[Any], flat_args: List[Any], vmap_level: int, args_spec) -> Tuple:
     # See NOTE [Ignored _remove_batch_dim, _add_batch_dim]
     batched_inputs = [arg if in_dim is None else
-                      _add_batch_dim(arg, in_dim, vmap_level)  # type: ignore
+                      _add_batch_dim(arg, in_dim, vmap_level)
                       for in_dim, arg in zip(flat_in_dims, flat_args)]
     return tree_unflatten(batched_inputs, args_spec)
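
The hunk above only drops a stale `# type: ignore`, but the comprehension it touches is the heart of vmap's input batching: arguments whose `in_dim` is `None` pass through unbatched, while the rest are wrapped at the current vmap level and then unflattened back into the original argument structure. A toy sketch of that shape (not functorch's real implementation; `add_batch_dim` here is a hypothetical stand-in that tags values instead of wrapping tensors):

```python
from typing import Any, List, Optional, Tuple

def add_batch_dim(arg: Any, in_dim: int, vmap_level: int) -> Tuple[Any, int, int]:
    # Stand-in for the real _add_batch_dim: tag rather than wrap a tensor.
    return (arg, in_dim, vmap_level)

def create_batched_inputs(flat_in_dims: List[Optional[int]],
                          flat_args: List[Any],
                          vmap_level: int) -> List[Any]:
    # None means "broadcast this argument"; everything else gets batched.
    return [arg if in_dim is None else add_batch_dim(arg, in_dim, vmap_level)
            for in_dim, arg in zip(flat_in_dims, flat_args)]

print(create_batched_inputs([0, None], ["x", "scale"], vmap_level=1))
# [('x', 0, 1), 'scale']
```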
 
diff --git a/functorch/functorch/csrc/BatchRulesDecompositions.cpp b/functorch/functorch/csrc/BatchRulesDecompositions.cpp
index d8d2591..3256847 100644
--- a/functorch/functorch/csrc/BatchRulesDecompositions.cpp
+++ b/functorch/functorch/csrc/BatchRulesDecompositions.cpp
@@ -259,4 +259,3 @@
 }
 
 }}
-
diff --git a/functorch/functorch/csrc/BatchRulesFactory.cpp b/functorch/functorch/csrc/BatchRulesFactory.cpp
index 97d8daf..a42583c 100644
--- a/functorch/functorch/csrc/BatchRulesFactory.cpp
+++ b/functorch/functorch/csrc/BatchRulesFactory.cpp
@@ -97,4 +97,3 @@
   // Not sure how to add the ones with irregular args to the mix cleanly (i.e. randint takes an extra int parameter)
 }
 }}
-
diff --git a/functorch/functorch/csrc/BatchRulesHelper.h b/functorch/functorch/csrc/BatchRulesHelper.h
index 263bcd2..2bacfeb 100644
--- a/functorch/functorch/csrc/BatchRulesHelper.h
+++ b/functorch/functorch/csrc/BatchRulesHelper.h
@@ -470,4 +470,3 @@
 }
 
 }}
-
diff --git a/functorch/functorch/csrc/BatchRulesLinearAlgebra.cpp b/functorch/functorch/csrc/BatchRulesLinearAlgebra.cpp
index 97228e4..d7286c5 100644
--- a/functorch/functorch/csrc/BatchRulesLinearAlgebra.cpp
+++ b/functorch/functorch/csrc/BatchRulesLinearAlgebra.cpp
@@ -216,4 +216,3 @@
   VARIADIC_BDIMS_BOXED(_lu_with_info);
 }
 }}
-
diff --git a/functorch/functorch/csrc/BatchRulesRandomness.cpp b/functorch/functorch/csrc/BatchRulesRandomness.cpp
index b4b8b6b..a4a9ef9 100644
--- a/functorch/functorch/csrc/BatchRulesRandomness.cpp
+++ b/functorch/functorch/csrc/BatchRulesRandomness.cpp
@@ -463,7 +463,7 @@
     decltype(&ATEN_FN2(randint_like, low_dtype)), &ATEN_FN2(randint_like, low_dtype), int64_t, int64_t, TENSOR_LIKE_COMMON_ARG_TYPES>);
   m.impl("rand_like", tensor_like_random_batch_rule<decltype(&ATEN_FN(rand_like)), &ATEN_FN(rand_like), TENSOR_LIKE_COMMON_ARG_TYPES>);
   m.impl("randn_like", tensor_like_random_batch_rule<decltype(&ATEN_FN(randn_like)), &ATEN_FN(randn_like), TENSOR_LIKE_COMMON_ARG_TYPES>);
-  
+
   #undef RANDOM_BATCH_RULE
   #undef RANDOM_BATCH_RULE2
   #undef RANDOM_INPLACE_BATCH_RULE
diff --git a/functorch/functorch/csrc/PlumbingHelper.h b/functorch/functorch/csrc/PlumbingHelper.h
index 7a2d4be..8a8441c 100644
--- a/functorch/functorch/csrc/PlumbingHelper.h
+++ b/functorch/functorch/csrc/PlumbingHelper.h
@@ -37,4 +37,3 @@
 }
 
 }}
-
diff --git a/functorch/notebooks/_src/plot_ensembling.py b/functorch/notebooks/_src/plot_ensembling.py
index 18ddbf5..94cd115 100644
--- a/functorch/notebooks/_src/plot_ensembling.py
+++ b/functorch/notebooks/_src/plot_ensembling.py
@@ -19,7 +19,6 @@
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
-from functools import partial
 torch.manual_seed(0)
 
 # Here's a simple CNN
diff --git a/functorch/notebooks/_src/plot_jacobians_and_hessians.py b/functorch/notebooks/_src/plot_jacobians_and_hessians.py
index 27e5c44..99db815 100644
--- a/functorch/notebooks/_src/plot_jacobians_and_hessians.py
+++ b/functorch/notebooks/_src/plot_jacobians_and_hessians.py
@@ -9,7 +9,6 @@
 provides ways of computing various higher-order autodiff quantities efficiently.
 """
 import torch
-import torch.nn as nn
 import torch.nn.functional as F
 from functools import partial
 torch.manual_seed(0)
diff --git a/functorch/notebooks/_src/plot_per_sample_gradients.py b/functorch/notebooks/_src/plot_per_sample_gradients.py
index 3c275a6..0feb2b8 100644
--- a/functorch/notebooks/_src/plot_per_sample_gradients.py
+++ b/functorch/notebooks/_src/plot_per_sample_gradients.py
@@ -12,7 +12,6 @@
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
-from functools import partial
 torch.manual_seed(0)
 
 # Here's a simple CNN
diff --git a/functorch/notebooks/colab/readme.md b/functorch/notebooks/colab/readme.md
index b3feae5..fbdf129 100644
--- a/functorch/notebooks/colab/readme.md
+++ b/functorch/notebooks/colab/readme.md
@@ -1,5 +1,5 @@
-### Holds the colab ready versions of the notebook tutorials.  
+### Holds the colab-ready versions of the notebook tutorials.
 
 These are similar to the jupyter notebooks, but have additional colab specific changes including the building of functorch in colab to prep for running.
 
-The colabs and notebooks are not auto-synced atm, thus currently updates to one need to be synched to the other. 
+The colabs and notebooks are not auto-synced, so currently updates to one need to be synced to the other.
diff --git a/functorch/writing_batching_rules.md b/functorch/writing_batching_rules.md
index 74ccf73..5f571c4 100644
--- a/functorch/writing_batching_rules.md
+++ b/functorch/writing_batching_rules.md
@@ -96,6 +96,3 @@
 1. [BatchingRegistrations.cpp](functorch/csrc/BatchingRegistrations.cpp): This is probably the easiest place to start. These were batching rules that were written with an old API, and thus have a lot of cruft in them that are no longer necessary. Porting these batching rules to using one of the above options is an easy way to get started and help us reduce tech debt :) Once you've gotten your footing with writing batching rules, you can start helping with writing new batching rules.
 2. Popular operators. See [1](https://github.com/facebookresearch/functorch/issues/112), [2](https://github.com/facebookresearch/functorch/issues/101), [3](https://github.com/facebookresearch/functorch/issues/102), and [4](https://github.com/facebookresearch/functorch/issues/102). These contain lists of (user-facing) PyTorch operators sorted by usages, along with whether they have a batching rule implemented or not.
 3. [Master List](https://docs.google.com/spreadsheets/d/1Sp4HUjxwMifS5oDQg0yvjqk7hKOpCfKO4jWH4MTGP-k/edit#gid=0). This is the master list of vmap operator support :). It's generated by [this script](op_analysis/gen_data.py). Theoretically, we want to support most of the operators in that list (that aren't composite or out variants).
-
-
-