{- github-backup
-
- Copyright 2012-2013 Joey Hess <joey@kitenet.net>
-
- Licensed under the GNU GPL version 3 or higher.
-}
{-# LANGUAGE GeneralizedNewtypeDeriving #-}
{-# LANGUAGE PackageImports #-}
module Main where
import qualified Data.Map as M
import qualified Data.Set as S
import qualified Data.Text as T
import Data.Either
import Data.String
import GHC.Exts (toList)
import Data.Vector (Vector)
import Options.Applicative
import Text.Show.Pretty
import "mtl" Control.Monad.State.Strict
import qualified GitHub.Data.Name as Github
import qualified GitHub.Auth as Github
import qualified GitHub.Data.Repos as Github
import qualified GitHub.Endpoints.Repos as Github
import qualified GitHub.Endpoints.Repos.Forks as Github
import qualified GitHub.Endpoints.PullRequests as Github
import qualified GitHub.Endpoints.Activity.Watching as Github
import qualified GitHub.Endpoints.Activity.Starring as Github
import qualified GitHub.Data.Definitions as Github ()
import qualified GitHub.Endpoints.Issues as Github
import qualified GitHub.Endpoints.Issues.Comments
import qualified GitHub.Endpoints.Issues.Milestones
import Common
import Utility.State
import qualified Git
import qualified Git.Construct
import qualified Git.Config
import qualified Git.Types
import qualified Git.Command
import qualified Git.Ref
import qualified Git.Branch
import qualified Git.UpdateIndex
import Github.GetAuth
import Github.EnumRepos
import Git.HashObject
import Git.FilePath
import Git.CatFile
import Git.Index
import Utility.Env
import Utility.FileSystemEncoding
repoUrl :: GithubUserRepo -> String
repoUrl (GithubUserRepo user remote) =
"https://github.com/" ++ user ++ "/" ++ remote ++ ".git"
repoWikiUrl :: GithubUserRepo -> String
repoWikiUrl (GithubUserRepo user remote) =
"https://github.com/" ++ user ++ "/" ++ remote ++ ".wiki.git"
-- A name for a github api call.
type ApiName = String
-- A request to make of github. It may have an extra parameter.
data Request = RequestSimple ApiName GithubUserRepo
| RequestNum ApiName GithubUserRepo Int
deriving (Eq, Show, Read, Ord)
requestRepo :: Request -> GithubUserRepo
requestRepo (RequestSimple _ repo) = repo
requestRepo (RequestNum _ repo _) = repo
requestName :: Request -> String
requestName (RequestSimple name _) = name
requestName (RequestNum name _ _) = name
data BackupState = BackupState
{ failedRequests :: S.Set Request
, retriedRequests :: S.Set Request
, retriedFailed :: S.Set Request
, gitRepo :: Git.Repo
, gitRepoRemotes :: [Git.Repo]
, gitHubAuth :: Maybe Github.Auth
, deferredBackups :: [Backup ()]
, catFileHandle :: Maybe CatFileHandle
, noForks :: Bool
}
{- Our monad. -}
newtype Backup a = Backup { runBackup :: StateT BackupState IO a }
deriving (
Monad,
MonadState BackupState,
MonadIO,
Functor,
Applicative
)
inRepo :: (Git.Repo -> IO a) -> Backup a
inRepo a = liftIO . a =<< getState gitRepo
failedRequest :: Request -> Github.Error -> Backup ()
failedRequest req e = unless ignorable $ do
set <- getState failedRequests
changeState $ \s -> s { failedRequests = S.insert req set }
where
-- "410 Gone" is used for repos that have issues etc disabled.
ignorable = "410 Gone" `isInfixOf` show e
runRequest :: Request -> Backup ()
runRequest req = do
-- avoid re-running requests that were already retried
retried <- getState retriedRequests
unless (S.member req retried) $
(lookupApi req) req
type Storer = Request -> Backup ()
data ApiListItem = ApiListItem ApiName Storer Bool
apiList :: [ApiListItem]
apiList =
[ ApiListItem "watchers" watchersStore True
, ApiListItem "stargazers" stargazersStore True
, ApiListItem "pullrequests" pullrequestsStore True
, ApiListItem "pullrequest" pullrequestStore False
, ApiListItem "milestones" milestonesStore True
, ApiListItem "issues" issuesStore True
, ApiListItem "issuecomments" issuecommentsStore False
-- Recursive things last.
, ApiListItem "userrepo" userrepoStore True
, ApiListItem "forks" forksStore True
]
{- Map of Github api calls we can make to store their data. -}
api :: M.Map ApiName Storer
api = M.fromList $ map (\(ApiListItem n s _) -> (n, s)) apiList
{- List of toplevel api calls that are followed to get data. -}
toplevelApi :: [ApiName]
toplevelApi = map (\(ApiListItem n _ _) -> n) $
filter (\(ApiListItem _ _ toplevel) -> toplevel) apiList
lookupApi :: Request -> Storer
lookupApi req = fromMaybe bad $ M.lookup name api
where
name = requestName req
bad = error $ "internal error: bad api call: " ++ name
watchersStore :: Storer
watchersStore = simpleHelper "watchers" Github.watchersFor' $
storeSorted "watchers"
stargazersStore :: Storer
stargazersStore = simpleHelper "stargazers" Github.stargazersFor $
storeSorted "stargazers"
pullrequestsStore :: Storer
pullrequestsStore = simpleHelper "pullrequest" Github.pullRequestsFor' $
forValues $ \req r -> do
let repo = requestRepo req
let n = Github.unIssueNumber $ Github.simplePullRequestNumber r
runRequest $ RequestNum "pullrequest" repo n
pullrequestStore :: Storer
pullrequestStore = numHelper "pullrequest" call $ \n ->
store ("pullrequest" </> show n)
where
call auth user repo n = Github.pullRequest' auth
(fromString user)
(fromString repo)
(Github.IssueNumber n)
milestonesStore :: Storer
milestonesStore = simpleHelper "milestone" GitHub.Endpoints.Issues.Milestones.milestones' $
forValues $ \req m -> do
let n = Github.milestoneNumber m
store ("milestone" </> show n) req m
issuesStore :: Storer
issuesStore = withHelper "issue"
(\a u r -> Github.issuesForRepo' a (fromString u) (fromString r))
Github.stateAll go
where
go = forValues $ \req i -> do
let repo = requestRepo req
let n = Github.unIssueNumber $ Github.issueNumber i
store ("issue" </> show n) req i
runRequest (RequestNum "issuecomments" repo n)
issuecommentsStore :: Storer
issuecommentsStore = numHelper "issuecomments" call $ \n ->
forValues $ \req c -> do
let i = Github.issueCommentId c
store ("issue" </> show n ++ "_comment" </> show i) req c
where
call auth user repo n = GitHub.Endpoints.Issues.Comments.comments' auth
(fromString user)
(fromString repo)
(Github.IssueNumber n)
userrepoStore :: Storer
userrepoStore = simpleHelper "repo" Github.repository' $ \req r -> do
store "repo" req r
when (Github.repoHasWiki r == Just True) $
updateWiki $ toGithubUserRepo r
maybe noop addFork $ Github.repoParent r
maybe noop addFork $ Github.repoSource r
forksStore :: Storer
forksStore = simpleHelper "forks" Github.forksFor' $ \req fs -> do
storeSorted "forks" req fs
mapM_ addFork fs
forValues :: (Request -> v -> Backup ()) -> Request -> Vector v -> Backup ()
forValues a req vs = forM_ (toList vs) (a req)
type ApiCall v = Maybe Github.Auth -> Github.Name Github.Owner -> Github.Name Github.Repo -> IO (Either Github.Error v)
type ApiWith v b = Maybe Github.Auth -> String -> String -> b -> IO (Either Github.Error v)
type ApiNum v = ApiWith v Int
type Handler v = Request -> v -> Backup ()
type Helper = Request -> Backup ()
simpleHelper :: FilePath -> ApiCall v -> Handler v -> Helper
simpleHelper dest call handler req@(RequestSimple _ (GithubUserRepo user repo)) =
deferOn dest req $ do
auth <- getState gitHubAuth
either (failedRequest req) (handler req)
=<< liftIO (call auth (fromString user) (fromString repo))
simpleHelper _ _ _ r = badRequest r
withHelper :: FilePath -> ApiWith v b -> b -> Handler v -> Helper
withHelper dest call b handler req@(RequestSimple _ (GithubUserRepo user repo)) =
deferOn dest req $ do
auth <- getState gitHubAuth
either (failedRequest req) (handler req) =<< liftIO (call auth user repo b)
withHelper _ _ _ _ r = badRequest r
numHelper :: FilePath -> ApiNum v -> (Int -> Handler v) -> Helper
numHelper dest call handler req@(RequestNum _ (GithubUserRepo user repo) num) =
deferOn dest req $ do
auth <- getState gitHubAuth
either (failedRequest req) (handler num req)
=<< liftIO (call auth user repo num)
numHelper _ _ _ r = badRequest r
badRequest :: Request -> a
badRequest r = error $ "internal error: bad request type " ++ show r
{- When the specified file or directory already exists in git, the action
- is deferred until later. -}
deferOn :: FilePath -> Request -> Backup () -> Backup ()
deferOn f req a = ifM (ingit $ storeLocation f req)
( changeState $ \s -> s { deferredBackups = a : deferredBackups s }
, a
)
where
ingit f' = do
h <- getCatFileHandle
liftIO $ isJust <$> catObjectDetails h
(Git.Types.Ref $ Git.Types.fromRef branchname ++ ":" ++ f')
getCatFileHandle :: Backup CatFileHandle
getCatFileHandle = go =<< getState catFileHandle
where
go (Just h) = return h
go Nothing = do
h <- withIndex $ inRepo catFileStart
changeState $ \s -> s { catFileHandle = Just h }
return h
store :: Show a => FilePath -> Request -> a -> Backup ()
store filebase req val = do
file <- (</>)
<$> workDir
<*> pure (storeLocation filebase req)
liftIO $ do
createDirectoryIfMissing True (takeDirectory file)
writeFile file (ppShow val)
storeLocation :: FilePath -> Request -> FilePath
storeLocation filebase = location . requestRepo
where
location (GithubUserRepo user repo) =
user ++ "_" ++ repo </> filebase
workDir :: Backup FilePath
workDir = (</>)
<$> (Git.repoPath <$> getState gitRepo)
<*> pure "github-backup.tmp"
storeSorted :: Ord a => Show a => FilePath -> Request -> Vector a -> Backup ()
storeSorted file req val = store file req (sort $ toList val)
{- Commits all files in the workDir into the github branch, and deletes the
- workDir.
-
- The commit is made to the github branch without ever checking it out,
- or otherwise disturbing the work tree.
-}
commitWorkDir :: Backup ()
commitWorkDir = do
dir <- workDir
whenM (liftIO $ doesDirectoryExist dir) $ do
branchref <- getBranch
withIndex $ do
r <- getState gitRepo
liftIO $ do
-- Reset index to current content of github
-- branch. Does not touch work tree.
Git.Command.run
[Param "reset", Param "-q", Param $ Git.Types.fromRef branchref, File "." ] r
-- Stage workDir files into the index.
h <- hashObjectStart r
Git.UpdateIndex.streamUpdateIndex r
[genstream dir h]
hashObjectStop h
-- Commit
void $ Git.Branch.commit Git.Branch.AutomaticCommit False "github-backup" fullname [branchref] r
removeDirectoryRecursive dir
where
genstream dir h streamer = do
fs <- filter (not . dirCruft) <$> dirContentsRecursive dir
forM_ fs $ \f -> do
sha <- hashFile h f
path <- asTopFilePath <$> relPathDirToFile dir f
streamer $ Git.UpdateIndex.updateIndexLine
sha Git.Types.TreeFile path
{- Returns the ref of the github branch, creating it first if necessary. -}
getBranch :: Backup Git.Ref
getBranch = maybe (hasOrigin >>= create) return =<< branchsha
where
create True = do
inRepo $ Git.Command.run
[Param "branch", Param $ Git.Types.fromRef branchname, Param $ Git.Types.fromRef originname]
fromMaybe (error $ "failed to create " ++ Git.Types.fromRef branchname)
<$> branchsha
create False = withIndex $
inRepo $ Git.Branch.commitAlways Git.Branch.AutomaticCommit "branch created" fullname []
branchsha = inRepo $ Git.Ref.sha fullname
{- Runs an action with a different index file, used for the github branch. -}
withIndex :: Backup a -> Backup a
withIndex a = do
r <- getState gitRepo
f <- liftIO $ indexEnvVal $ Git.localGitDir r </> "github-backup.index"
e <- liftIO getEnvironment
let r' = r { Git.Types.gitEnv = Just $ (indexEnv, f):e }
changeState $ \s -> s { gitRepo = r' }
v <- a
changeState $ \s -> s { gitRepo = (gitRepo s) { Git.Types.gitEnv = Git.Types.gitEnv r } }
return v
branchname :: Git.Ref
branchname = Git.Ref "github"
fullname :: Git.Ref
fullname = Git.Ref $ "refs/heads/" ++ Git.Types.fromRef branchname
originname :: Git.Ref
originname = Git.Ref $ "refs/remotes/origin/" ++ Git.Types.fromRef branchname
hasOrigin :: Backup Bool
hasOrigin = inRepo $ Git.Ref.exists originname
updateWiki :: GithubUserRepo -> Backup ()
updateWiki fork =
ifM (any (\r -> Git.remoteName r == Just remote) <$> getState gitRepoRemotes)
( void fetchwiki
, void $
-- github often does not really have a wiki,
-- don't bloat config if there is none
unlessM (addRemote remote $ repoWikiUrl fork) $
removeRemote remote
)
where
fetchwiki = inRepo $ Git.Command.runBool [Param "fetch", Param remote]
remote = remoteFor fork
remoteFor (GithubUserRepo user repo) =
"github_" ++ user ++ "_" ++ repo ++ ".wiki"
addFork :: ToGithubUserRepo a => a -> Backup ()
addFork forksource = unlessM (elem fork . gitHubRemotes <$> getState gitRepoRemotes) $ do
liftIO $ putStrLn $ "New fork: " ++ repoUrl fork
void $ addRemote (remoteFor fork) (repoUrl fork)
gitRepo' <- inRepo $ Git.Config.reRead
changeState $ \s -> s { gitRepo = gitRepo' }
gatherMetaData fork
where
fork = toGithubUserRepo forksource
remoteFor (GithubUserRepo user repo) = "github_" ++ user ++ "_" ++ repo
{- Adds a remote, also fetching from it. -}
addRemote :: String -> String -> Backup Bool
addRemote remotename remoteurl =
inRepo $ Git.Command.runBool
[ Param "remote"
, Param "add"
, Param "-f"
, Param remotename
, Param remoteurl
]
removeRemote :: String -> Backup ()
removeRemote remotename = void $ inRepo $ Git.Command.runBool
[ Param "remote"
, Param "rm"
, Param remotename
]
{- Fetches from the github remote. Done by github-backup, just because
- it would be weird for a backup to not fetch all available data.
- Even though its real focus is on metadata not stored in git. -}
fetchRepo :: Git.Repo -> Backup Bool
fetchRepo repo = inRepo $ Git.Command.runBool
[Param "fetch", Param $ fromJust $ Git.Types.remoteName repo]
gatherMetaData :: GithubUserRepo -> Backup ()
gatherMetaData repo = do
noforks <- getState noForks
liftIO $ putStrLn $ "Gathering metadata for " ++ repoUrl repo ++ " ..."
mapM_ call (filter (forksfilter noforks) toplevelApi)
where
call name = runRequest $ RequestSimple name repo
forksfilter noforks name = not (noforks && name == "forks")
storeRetry :: [Request] -> Git.Repo -> IO ()
storeRetry [] r = void $ do
try $ removeFile (retryFile r) :: IO (Either SomeException ())
storeRetry retryrequests r = writeFile (retryFile r) (show retryrequests)
loadRetry :: Git.Repo -> IO [Request]
loadRetry r = maybe [] (fromMaybe [] . readish)
<$> catchMaybeIO (readFileStrict (retryFile r))
retryFile :: Git.Repo -> FilePath
retryFile r = Git.localGitDir r </> "github-backup.todo"
retry :: Backup ()
retry = do
todo <- inRepo loadRetry
unless (null todo) $ do
liftIO $ putStrLn $
"Retrying " ++ show (length todo) ++
" requests that failed last time..."
mapM_ runRequest todo
changeState $ \s -> s
{ retriedFailed = failedRequests s
, failedRequests = S.empty
, retriedRequests = S.fromList todo
}
summarizeRequests :: [Request] -> [String]
summarizeRequests = go M.empty
where
go m [] = map format $ sort $ map swap $ M.toList m
go m (r:rs) = go (M.insertWith (+) (requestName r) (1 :: Integer) m) rs
format (num, name) = show num ++ "\t" ++ name
swap (a, b) = (b, a)
{- Save all backup data. Files that were written to the workDir are committed.
- Requests that failed are saved for next time. Requests that were retried
- this time and failed are ordered last, to ensure that we don't get stuck
- retrying the same requests and not making progress when run again.
-
- Returns any requests that failed.
-}
save :: Backup [Request]
save = do
commitWorkDir
failed <- getState failedRequests
retriedfailed <- getState retriedFailed
let toretry = S.toList failed ++ S.toList retriedfailed
inRepo $ storeRetry toretry
endState
return toretry
showFailures :: [Request] -> IO ()
showFailures [] = noop
showFailures l = hPutStrLn stderr $ unlines $
["Backup may be incomplete; " ++
show (length l) ++ " requests failed:"
] ++ map (" " ++) (summarizeRequests l) ++
[ "Run again later."
]
newState :: Bool -> Git.Repo -> IO BackupState
newState noforks r = BackupState
<$> pure S.empty
<*> pure S.empty
<*> pure S.empty
<*> pure r
<*> Git.Construct.fromRemotes r
<*> getAuth
<*> pure []
<*> pure Nothing
<*> pure noforks
endState :: Backup ()
endState = liftIO . maybe noop catFileStop =<< getState catFileHandle
genBackupState :: Bool -> Git.Repo -> IO BackupState
genBackupState noforks repo = newState noforks =<< Git.Config.read repo
backupRepo :: Bool -> (Maybe Git.Repo) -> IO ()
backupRepo _ Nothing = error "not in a git repository, and nothing specified to back up"
backupRepo noforks (Just repo) =
genBackupState noforks repo >>= evalStateT (runBackup go) >>= showFailures
where
go = do
retry
mainBackup
runDeferred
save
mainBackup :: Backup ()
mainBackup = do
remotes <- gitHubPairs <$> getState gitRepoRemotes
when (null remotes) $
error "no github remotes found"
forM_ remotes $ \(r, remote) -> do
void $ fetchRepo r
gatherMetaData remote
runDeferred :: Backup ()
runDeferred = go =<< getState deferredBackups
where
go [] = noop
go l = do
changeState $ \s -> s { deferredBackups = [] }
void $ sequence l
-- Running the deferred actions could cause
-- more actions to be deferred; run them too.
runDeferred
backupOwner :: Bool -> [GithubUserRepo] -> Owner -> IO ()
backupOwner noforks exclude (Owner name) = do
auth <- getAuth
l <- sequence
[ Github.userRepos' auth (fromString name) Github.RepoPublicityAll
, Github.reposWatchedBy' auth (fromString name)
, Github.reposStarredBy auth (fromString name)
, Github.organizationRepos' auth (fromString name) Github.RepoPublicityAll
]
let nameurls = nub $ mapMaybe makenameurl $ concatMap toList $ rights l
when (null nameurls) $
if (null $ rights l)
then error $ unlines $ "Failed to query github for repos:" : map show (lefts l)
else error $ "No GitHub repositories found for " ++ name
-- Clone any missing repos, and get a BackupState for each repo
-- that is to be backed up.
states <- catMaybes <$> forM nameurls prepare
-- First pass only retries things that failed before, so the
-- retried actions will run in each repo before too much API is
-- used up.
states' <- forM states (execStateT . runBackup $ retry)
states'' <- forM states' (execStateT . runBackup $ mainBackup)
forM states'' (evalStateT . runBackup $ runDeferred >> save)
>>= showFailures . concat
where
excludeurls = map repoUrl exclude
makenameurl :: Github.Repo -> Maybe (String, String)
makenameurl repo =
case Github.repoGitUrl repo of
Just url -> Just (T.unpack $ Github.untagName $ Github.repoName repo, T.unpack $ Github.getUrl url)
Nothing -> Nothing
prepare (dir, url)
| url `elem` excludeurls = return Nothing
| otherwise = do
unlessM (doesDirectoryExist dir) $ do
putStrLn $ "New repository: " ++ dir
ok <- boolSystem "git"
[ Param "clone"
, Param url
, Param dir
]
unless ok $ error "clone failed"
Just <$> (genBackupState noforks =<< Git.Construct.fromPath dir)
data Options = Options
{ includeOwner :: [Owner]
, excludeRepo :: [GithubUserRepo]
, noForksOpt :: Bool
}
deriving (Show)
data Owner = Owner String
deriving (Show)
options :: Parser Options
options = Options <$> many owneropt <*> many excludeopt <*> noforksopt
where
owneropt = Owner <$> (argument str)
( metavar "USERNAME|ORGANIZATION"
<> help "Back up repositories owned by this entity."
)
excludeopt = parseUserRepo <$> (strOption
( long "exclude"
<> metavar "USERNAME/REPOSITORY"
<> help "Skip backing up a repository."
))
noforksopt = switch
( long "no-forks"
<> help "Do not backup forks."
)
parseUserRepo :: String -> GithubUserRepo
parseUserRepo s =
let (user, repo) = separate (== '/') s
in GithubUserRepo user repo
main :: IO ()
main = do
useFileSystemEncoding
execParser opts >>= go
where
opts = info (helper <*> options)
( fullDesc
<> progDesc desc
<> header "github-backup - backs up data from GitHub"
)
desc = unlines
[ "Backs up all forks, issues, etc of a GitHub repository."
, "Run without any parameters inside a clone of a repository to back it up."
, "Or, specify whose repositories to back up."
]
go (Options owner exclude noforks)
| null owner = backupRepo noforks =<< Git.Construct.fromCwd
| otherwise = mapM_ (backupOwner noforks exclude) owner