Source code for materials_commons.cli.treedb

import os
import sqlite3
import time
import warnings

from tabulate import tabulate

import materials_commons.api as mcapi
import materials_commons.cli.exceptions as cliexcept
import materials_commons.cli.functions as clifuncs
import materials_commons.cli.file_functions as filefuncs
from materials_commons.cli.sqltable import SqlTable, sql_iter


# tree:
# path, name, parent_path, mtime, size, checksum, otype, id

def do_with_queue(f, queue):
    """Drain `queue`, calling `f(item, queue)` for every item popped from it.

    Arguments:
        f: callable
            Called as `f(item, queue)`. It may add further items to `queue`, which are then
            processed as well.
        queue: container supporting `len()` and `pop()`
            Work items. It is emptied in place.
    """
    while len(queue) > 0:
        item = queue.pop()
        f(item, queue)
def _print_children_table(all_children, treename='remote'):
    """Print a table showing, for each child path, where it was found.

    Arguments:
        all_children: dict
            Maps child path -> dict which may hold a 'tree' record and / or a 'db' record.
            Each record must provide a 'name' entry.
        treename: str
            Column header used for the "found in tree" flag.
    """
    headers = ['path', 'name', treename, 'db']
    columns = dict((header, []) for header in headers)

    for child_path in all_children:
        found = all_children[child_path]
        in_tree = 'tree' in found
        in_db = 'db' in found

        # the name is taken from the tree record when there is one, else from the db record
        name = found['tree']['name'] if in_tree else found['db']['name']

        columns['path'].append(child_path)
        columns['name'].append(name)
        columns[treename].append(in_tree)
        columns['db'].append(in_db)

    print('Children:')
    print(tabulate(columns, headers=headers))
    print()
class TreeTable(SqlTable):
    """Store information on files and directories in a tree

    This is a base class. Derived classes must implement:

    - @staticmethod default_print_fmt(): list of tuple, see an example
    - @staticmethod treename(): str, tree label for printing ('remote', 'local')
    - @staticmethod tablename(): str, tree table name in sqlite database ('remotetree',
      'localtree')
    - needs_update(self, existing): bool, check if record needs updating
    - insert_non_existent(self, path, checktime=None, verbose=False): called by `update` when
      nothing is found at `path`
    - _check(self, path, checktime=None, get_children=True): (file_or_dir, children), see an
      example

    Values:

        path: str, path including project directory
        name: str, file or directory name
        parent_path: str, path of parent directory (or None for 'top')
        otype: str, 'file' or 'directory'
        mtime: real, file or directory modify time (s since epoch)
        checktime: real, last time the record was updated (s since epoch)
        size: integer, file size
        checksum: str, md5 hash
        id: str, ID string, depends on type of tree. For `RemoteTree` this is the Materials
            Commons id. For `LocalTree` this is None.
        parent_id: str, ID string, depends on type of tree. For `RemoteTree` this is the
            Materials Commons id. For `LocalTree` this is None.
    """

    # column name: [type, (optional) constraints]
    @staticmethod
    def tablecolumns():
        """Return the column definitions: dict of column name -> [type, (optional) constraints]"""
        return {
            "id": ["text"],
            "parent_id": ["text"],
            "name": ["text"],
            "path": ["text", "UNIQUE"],
            "parent_path": ["text"],
            "mtime": ["real"],
            "size": ["integer"],
            "checksum": ["text"],
            "otype": ["text"],      # "file" or "directory"
            "checktime": ["real"]   # last time the remote data was queried (s since epoch)
        }

    def __init__(self, proj_local_path):
        """
        Arguments:
            proj_local_path: str
                Path to Materials Commons project directory, for storing sqlite db file
        """
        super(TreeTable, self).__init__(proj_local_path)

    def _delete_one_by_path(self, path, verbose=False):
        """Delete one record by path"""
        if verbose:
            print('Deleting ' + path + ' from db... ', end='')
        self.curs.execute("DELETE FROM " + self.tablename() + " WHERE path=?", (path,))
        self.conn.commit()
        if verbose:
            print('DONE')

    def _delete_recurs_by_path(self, path, verbose=False):
        """Delete record by path, and children recursively"""

        def _delete_and_enqueue_children(current_path, queue):
            self._delete_one_by_path(current_path, verbose=verbose)
            # children are looked up by parent_path, so they can still be found after the
            # parent record itself has been deleted
            for record in self.select_by_parent_path(current_path):
                queue.add(record["path"])

        queue = set([path])
        do_with_queue(_delete_and_enqueue_children, queue)

    def delete_by_path(self, path, recurs=False, verbose=False):
        """Delete individual entry by path

        Arguments:
            path: str
                The Materials Commons path of the file_or_dir to delete.
            recurs: bool
                If True, delete children recursively
            verbose: bool
                If True, print status.
        """
        if recurs:
            self._delete_recurs_by_path(path, verbose=verbose)
        else:
            self._delete_one_by_path(path, verbose=verbose)

    def update(self, path, get_children=True, recurs=False, verbose=False, force=False):
        """Update tree table to accurately reflect a particular directory

        Arguments:
            path: str
                Materials Commons path to the directory to update (includes project directory).
            get_children: bool
                If True, also update children files and directories.
            recurs: bool
                If True, updating children is done recursively.
            verbose: bool
                If True, print status.
            force: bool
                If True, update even if `needs_update` reports that the existing record does
                not need updating.

        Raises:
            MCCLIException: if more than one record exists for `path`.
            Exception: if a child record has an 'otype' other than 'file' or 'directory'.
        """
        if verbose:
            print(path, end='')

        existing = None
        res = self.select_by_path(path)
        if len(res) == 1:
            existing = res[0]
            if not force and not self.needs_update(existing):
                return
        elif len(res) > 1:
            raise cliexcept.MCCLIException("Error in TableTree.update: >1 record for path")

        checktime = time.time()
        file_or_dir, children = self._check(path, checktime=checktime, get_children=get_children)

        if verbose:
            if file_or_dir is None:
                print(' -> Does not exist in tree')
            else:
                print(' -> Found')

        # insert / replace / remove

        # if the file or directory does not exist anymore, remove it recursively
        if file_or_dir is None:
            if existing:
                self.delete_by_path(existing['path'], recurs=True, verbose=verbose)

            # NOTE: in some cases this might avoid re-checking paths that have been found not to
            # exist, but it might not be the best way to do this
            self.insert_non_existent(path, checktime=checktime, verbose=verbose)
            return

        self.insert_or_replace(file_or_dir, verbose=verbose)

        # check children, optionally
        if not get_children:
            return

        # children that only exist in the database must be deleted,
        # those that exist outside the database should be inserted or replaced
        all_children = {}

        def _insert_child(child, category):
            if child['path'] not in all_children:
                all_children[child['path']] = dict()
            all_children[child['path']][category] = child

        for child in children:
            _insert_child(child, 'tree')

        for child in self.select_by_parent_path(file_or_dir['path']):
            _insert_child(child, 'db')

        # print table of children
        if verbose:
            _print_children_table(all_children, self.treename())

        for child_path in all_children:
            child = all_children[child_path].get('tree', None)

            # if child does not exist in tree, then delete it
            if child is None:
                self.delete_by_path(child_path, recurs=True)

            # if child is a file that exists in tree
            elif child['otype'] == 'file':
                self.insert_or_replace(child, verbose=verbose)

            # elif child is a directory that exists in tree
            elif child['otype'] == 'directory':
                if recurs:
                    # if not recursive, do not update other next children, so children=recurs
                    self.update(child['path'], get_children=recurs, recurs=recurs,
                                verbose=verbose, force=force)
                else:
                    self.insert_or_replace(child, verbose=verbose)

            else:
                raise Exception("TreeTable.update error: Unknown error updating children")

        return

    def select_all(self, fetchsize=1000):
        """Select all records

        Arguments:
            fetchsize: int
                Passed through to `sql_iter` as `fetchsize`.

        Yields:
            sqlite3.Row or None, if no records left
        """
        self.curs.execute("SELECT * FROM " + self.tablename())
        for record in sql_iter(self.curs, fetchsize=fetchsize):
            yield record

    def select_by_path(self, path):
        """Select records by path

        Returns:
             List of sqlite3.Row
        """
        self.curs.execute("SELECT * FROM " + self.tablename() + " WHERE path=?", (path, ))
        return self.curs.fetchall()

    def select_by_id(self, id):
        """Select record by id

        Returns:
             sqlite3.Row or None
        """
        self.curs.execute("SELECT * FROM " + self.tablename() + " WHERE id=?", (id, ))
        return self.curs.fetchone()

    def select_by_parent_path(self, parent_path):
        """Select records by parent_path

        Returns:
             List of sqlite3.Row
        """
        self.curs.execute("SELECT * FROM " + self.tablename() + " WHERE parent_path=?",
                          (parent_path, ))
        return self.curs.fetchall()

    def select_by_parent_id(self, parent_id):
        """Select records by parent_id

        Returns:
             List of sqlite3.Row
        """
        # the bound value must be this method's own `parent_id` argument
        self.curs.execute("SELECT * FROM " + self.tablename() + " WHERE parent_id=?",
                          (parent_id, ))
        return self.curs.fetchall()

    def _walk_results(self, path):
        """Return (top, dirs, files) records for the directory stored at `path`

        Raises:
            MCCLIException: if there is not exactly one record for `path`, or if that record is
                not a directory.
        """
        results = self.select_by_path(path)
        if not results:
            raise cliexcept.MCCLIException(
                "TreeTable.walk error: no record for path '" + path + "'")
        if len(results) > 1:
            raise cliexcept.MCCLIException(
                "TreeTable.walk error: multiple records for path '" + path + "'")

        top = results[0]
        if top['otype'] != 'directory':
            raise cliexcept.MCCLIException(
                "TreeTable.walk error: '" + path + "' is not a directory")

        children = self.select_by_parent_path(top['path'])
        dirs = [x for x in children if x['otype'] == 'directory']
        files = [x for x in children if x['otype'] == 'file']
        return (top, dirs, files)

    def walk(self, path, topdown=True):
        """Yields (parent, dirs, files) records recursively starting from directory at `path`

        Arguments:
            path: str
                Path of the directory record to start from.
            topdown: bool
                If True, a directory is yielded before its subdirectories; otherwise after them.
        """
        (root, dirs, files) = self._walk_results(path)
        if topdown:
            yield (root, dirs, files)
        for subdir in dirs:
            # re-yield each tuple of the nested walk, rather than the nested generator itself
            for item in self.walk(subdir['path'], topdown=topdown):
                yield item
        if not topdown:
            yield (root, dirs, files)
class RemoteTree(TreeTable):
    """Store information on files and directories in the Materials Commons project

    Values:

        id: str, Materials Commons id string
        parent_id: str, Materials Commons id string
        path: str, path including project directory
        name: str, file or directory name
        otype: str, 'file' or 'directory'
        mtime: real, file or directory modify time (s since epoch)
        checktime: real, last time the record was updated (s since epoch)
        size: integer, file size
        checksum: str, md5 hash
        parent_path: str, path of parent directory (or none for 'top')
    """

    @staticmethod
    def default_print_fmt():
        """Return the default print format: list of (key, header, fmt, size, function)"""
        from materials_commons.cli.functions import as_is, format_time, humanize
        # (key, header, fmt, size, function)
        return [
            ("path", "path", "<", 80, as_is),
            ("name", "name", "<", 24, as_is),
            ("otype", "otype", "<", 16, as_is),
            ("mtime", "mtime", "<", 24, format_time),
            ("checktime", "checktime", "<", 24, format_time),
            ("size", "size", "<", 8, humanize),
            ("checksum", "checksum", "<", 36, as_is),
            ("id", "id", "<", 36, as_is),
            ("parent_id", "parent_id", "<", 36, as_is),
            ("parent_path", "parent_path", "<", 80, as_is)
        ]

    @staticmethod
    def treename():
        """Tree label used when printing"""
        return "remote"

    @staticmethod
    def tablename():
        """Name of the sqlite table holding this tree"""
        return "remotetree"

    def __init__(self, proj, updatetime):
        """
        Arguments:
            proj: project object
                Must provide `local_path`; `name`, `id` and `remote` are read by other methods.
            updatetime: float or None
                Records with a 'checktime' later than this are treated as current by
                `needs_update`. If falsy, every record needs updating.
        """
        super(RemoteTree, self).__init__(proj.local_path)
        self.updatetime = updatetime
        self.proj = proj

    def needs_update(self, existing):
        """Return True unless `existing` was checked more recently than `self.updatetime`"""
        last_checked = existing['checktime']
        if not last_checked:
            return True
        return not (self.updatetime and last_checked > self.updatetime)

    def insert_non_existent(self, path, checktime=None, verbose=False):
        """Insert a record for `path` holding only path, name, parent_path and checktime

        Arguments:
            path: str
                Materials Commons path that was not found.
            checktime: float or None
                Time of the check (s since epoch). If None, the current time is used.
            verbose: bool
                Passed through to `insert_or_replace`.
        """
        parent_path = None if path == self.proj.name else os.path.dirname(path)
        if checktime is None:
            checktime = time.time()

        self.insert_or_replace(
            {
                "path": path,
                "name": os.path.basename(path),
                "parent_path": parent_path,
                "checktime": checktime
            },
            verbose=verbose)
        return

    def _make_record(self, file_or_dir, checktime):
        """Make a record dict from a mcapi.File instance

        Arguments:
            file_or_dir: mcapi.File
                The object to be inserted.
            checktime: float
                Time the API call to check the object was made (s since the epoch).

        Returns:
            record: dict, suitable for database insertion

        Raises:
            MCCLIException: if `file_or_dir` is neither a file nor a directory.
        """
        record = {}
        for key in self.tablecolumns():

            if key == "id":
                if file_or_dir.id:
                    record["id"] = str(file_or_dir.id)
                continue

            if key == "parent_id":
                if file_or_dir.directory_id:
                    record["parent_id"] = str(file_or_dir.directory_id)
                continue

            if key == "checktime":
                record["checktime"] = checktime
                continue

            if key == "mtime":
                record["mtime"] = clifuncs.epoch_time(file_or_dir.updated_at)
                continue

            if key == "path":
                record["path"] = file_or_dir.path
                continue

            if key == "parent_path":
                # a path that is its own dirname has no parent to record
                parent_path = os.path.dirname(file_or_dir.path)
                if parent_path != file_or_dir.path:
                    record["parent_path"] = parent_path
                continue

            if key == "otype":
                if filefuncs.isfile(file_or_dir):
                    record["otype"] = "file"
                elif filefuncs.isdir(file_or_dir):
                    record["otype"] = "directory"
                else:
                    raise cliexcept.MCCLIException(
                        "Invalid file_or_dir type: " + str(type(file_or_dir)))
                continue

            # any other column is copied from the object's data, skipping values that are
            # missing, None, or an empty string
            data = file_or_dir._data
            if key not in data:
                continue
            value = data[key]
            if value is None:
                continue
            if isinstance(value, str) and not value:
                continue
            record[key] = value

        return record

    def _check(self, path, checktime=None, get_children=True):
        """Get current status of file or directory at path

        Arguments:
            path: str
                Materials Commons path to update.
            checktime: float or None
                Time the status is being checked (s since the epoch). If None, the current time
                is used.
            get_children: boolean
                If True, also get children.

        Returns:
            (file_or_dir, children):

                file_or_dir: dict, or None
                    None if File or Directory not found on server. Else, record representing the
                    file or directory.

                children: List of dict, or None
                    None if children not requested or the path was not found; empty if the path
                    is a file; else records representing each child.

        Raises:
            MCCLIException: if `path` is not a str.
        """
        file_or_dir_obj = None
        file_or_dir = None
        children = None

        if checktime is None:
            checktime = time.time()

        if not isinstance(path, str):
            raise cliexcept.MCCLIException(
                "Error getting Directory: 'path' is of type '" + str(type(path)) + "'")

        try:
            file_or_dir_obj = filefuncs.get_by_path_if_exists(
                self.proj.remote, self.proj.id, path)
            if file_or_dir_obj is None:
                return (file_or_dir, children)
            if file_or_dir_obj.path is None:
                file_or_dir_obj.path = path
            file_or_dir = self._make_record(file_or_dir_obj, checktime)
        except cliexcept.MCCLIException:
            # an MCCLIException raised in the lookup or the record conversion is ignored;
            # `file_or_dir` is then left as None
            pass

        if get_children:
            children = []
            if file_or_dir_obj is not None and filefuncs.isdir(file_or_dir_obj):
                listing = self.proj.remote.list_directory(self.proj.id, file_or_dir_obj.id)
                for child in listing:
                    # TODO: child does not have 'size' or 'checksum'
                    child_checktime = checktime
                    if filefuncs.isdir(child):
                        # TODO: remove this
                        child = self.proj.remote.get_directory(self.proj.id, child.id)
                        # directory children are recorded without a checktime
                        child_checktime = None
                    if filefuncs.isfile(child):
                        # TODO: remove this
                        child = self.proj.remote.get_file(self.proj.id, child.id)
                    child.path = os.path.join(path, child.name)
                    children.append(self._make_record(child, child_checktime))

        return (file_or_dir, children)
class LocalTree(TreeTable):
    """Store information on files and directories in the working tree

    Values:

        id: None, (always None)  # TODO: clean up 'id', 'parent_id', 'parent_path'
        parent_id: None, (always None)
        path: str, relative path including project directory (matches Materials Commons 'path')
        name: str, file or directory name
        otype: str, 'file' or 'directory'
        mtime: real, file or directory modify time (s since epoch)
        checktime: real, last time the record was updated (s since epoch)
        size: integer, file size
        checksum: str, md5 hash
        parent_path: str, path of parent directory (or none for 'top')
    """

    @staticmethod
    def default_print_fmt():
        """Return the default print format: list of (key, header, fmt, size, function)"""
        from materials_commons.cli.functions import as_is, format_time, humanize
        # (key, header, fmt, size, function)
        return [
            ("path", "path", "<", 80, as_is),
            ("name", "name", "<", 24, as_is),
            ("otype", "otype", "<", 16, as_is),
            ("mtime", "mtime", "<", 24, format_time),
            ("checktime", "checktime", "<", 24, format_time),
            ("size", "size", "<", 8, humanize),
            ("checksum", "checksum", "<", 36, as_is),
            ("id", "id", "<", 80, as_is),
            ("parent_path", "parent_path", "<", 80, as_is)
        ]

    @staticmethod
    def treename():
        """Tree label used when printing"""
        return "local"

    @staticmethod
    def tablename():
        """Name of the sqlite table holding this tree"""
        return "localtree"

    def __init__(self, proj_local_path):
        """
        Arguments:
            proj_local_path: str
                Path to Materials Commons project directory
        """
        super(LocalTree, self).__init__(proj_local_path)
        self.proj_local_path = proj_local_path

    def needs_update(self, existing):
        """Return True if the recorded path is missing or its modify time differs from the record

        NOTE(review): the recorded 'path' is passed to `os.path` as-is, whereas `_check` first
        converts the path with `filefuncs.make_local_abspath` -- confirm this is intended.
        """
        recorded_path = existing['path']
        if not os.path.exists(recorded_path):
            return True
        return os.path.getmtime(recorded_path) != existing['mtime']

    def insert_non_existent(self, path, checktime=None, verbose=False):
        """Do not insert non-existent"""
        return

    def _make_record(self, local_abspath, checktime):
        """Make a record dict for a local path

        Arguments:
            local_abspath: str
                Absolute path to file or directory
            checktime: float
                Time the check of the object was made (s since the epoch).

        Returns:
            record: dict, suitable for database insertion

        Raises:
            MCCLIException: if `local_abspath` is neither a directory nor a file.
        """
        is_directory = os.path.isdir(local_abspath)
        if not is_directory and not os.path.isfile(local_abspath):
            raise cliexcept.MCCLIException(
                "LocalTree._make_record error: 'local_abspath'='" + local_abspath
                + "' is not a file or directory.")

        record = {}
        if is_directory:
            record['otype'] = 'directory'
        else:
            # checksum and size are only recorded for files
            record['otype'] = 'file'
            record['checksum'] = clifuncs.checksum(local_abspath)
            record['size'] = os.path.getsize(local_abspath)

        mcpath = filefuncs.make_mcpath(self.proj_local_path, local_abspath)
        record['path'] = mcpath

        # the project directory itself has no parent and is named "/"
        is_project_root = (local_abspath == self.proj_local_path)
        record['parent_path'] = None if is_project_root else os.path.dirname(mcpath)
        record['name'] = "/" if is_project_root else os.path.basename(mcpath)

        record['mtime'] = os.path.getmtime(local_abspath)
        record['checktime'] = checktime
        return record

    def _check(self, path, checktime=None, get_children=True):
        """Get current status of a file or directory

        Arguments:
            path: str
                Materials Commons path to update.
            checktime: float or None
                When the status is being checked (s since the epoch). If None, the current time
                is used.
            get_children: boolean
                If True, also get children.

        Returns:
            (file_or_dir, children):

                file_or_dir: dict, or None
                    None if nothing exists at the local path. Else, record representing the file
                    or directory.

                children: List of dict, or None
                    None if children not requested or nothing exists at the local path; empty if
                    the local path is not a directory; else records representing each child
                    (the ".mc" entry is skipped).

        Raises:
            MCCLIException: if the local path made from `path` is not a str, or if a child is
                neither a file nor a directory.
        """
        file_or_dir = None
        children = None

        local_abspath = filefuncs.make_local_abspath(self.proj_local_path, path)

        if checktime is None:
            checktime = time.time()

        if not isinstance(local_abspath, str):
            raise cliexcept.MCCLIException(
                "LocalTree._check error: 'path' is of type '" + str(type(path)) + "'")

        if not os.path.exists(local_abspath):
            return (file_or_dir, children)

        try:
            file_or_dir = self._make_record(local_abspath, checktime)
        except cliexcept.MCCLIException:
            # an MCCLIException from the record conversion is ignored; `file_or_dir` is then
            # left as None
            pass

        if get_children:
            if os.path.isdir(local_abspath):
                children = [
                    self._make_record(os.path.join(local_abspath, entry), checktime)
                    for entry in os.listdir(local_abspath)
                    if entry != ".mc"
                ]
            else:
                children = []

        return (file_or_dir, children)